# Library

In [None]:
# Mount Google Drive at /content/drive (Colab only). The scraping cells in this
# notebook write their .xlsx output under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Scraping Data
import requests, time, os
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

In [None]:
# Pre-processing Data
# Case Folding & Cleaning Data
import pandas as pd
import glob
import os
import re

# Sentence Splitting
import stanza
import pandas as pd
from tqdm.auto import tqdm

# Normalisasi
import pandas as pd
from tqdm import tqdm
from indo_normalizer import Normalizer

In [None]:
# Ekstraksi Aspek
import stanza
import pandas as pd

In [None]:
# Klasifikasi Sentimen
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

In [None]:
# Metode Balas
import heapq
import math

# Scraping Data

## Emina

In [None]:
# LIBRARY
import requests, time, os
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# CONFIG
# Label used in the output filename, plus the brand/product text each fetched
# page must contain to be accepted.
product_name = "Emina"
expected_product = "Sun Protection SPF 30 PA+++"
expected_brand = "Emina"

# Page range to walk and the cap on collected reviews.
start_page = 1
end_page = 300
max_reviews = 500

# Fetch policy.
max_retry = 3          # attempts per page before the page is skipped
retry_delay = 2        # seconds slept between attempts and between pages
request_timeout = 30   # seconds; without a timeout a stalled connection hangs the cell

# Review listing URL; "{}" is replaced by the page number.
base_url = "https://reviews.femaledaily.com/products/moisturizer/sun-protection-1/emina/sun-protection-spf-30-pa?cat=&cat_id=0&age_range=&skin_type=&skin_tone=&skin_undertone=&hair_texture=&hair_type=&order=newest&page={}"
headers = {"User-Agent": "Mozilla/5.0"}

# One dict per review: "Review Text", "Rating", "Date".
all_reviews = []

# SCRAPING
# Walk the listing pages in order until max_reviews rows are collected or
# end_page is reached.
for page in range(start_page, end_page + 1):

    if len(all_reviews) >= max_reviews:
        print(f"Stop: sudah terkumpul {max_reviews} review.")
        break

    print(f"\n================ PAGE {page} ================")

    retry = 0
    review_cards = []
    wrong_product = False

    while retry < max_retry:
        print(f"Mengambil halaman {page} (Percobaan {retry+1}/{max_retry})")
        url = base_url.format(page)

        # A connection error or timeout uses up one attempt instead of
        # aborting the whole run.
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as err:
            print(f"Request gagal ({err}), coba lagi.")
            retry += 1
            time.sleep(retry_delay)
            continue

        if response.status_code != 200:
            print("Status code gagal, coba lagi.")
            retry += 1
            time.sleep(retry_delay)
            continue

        soup = BeautifulSoup(response.text, "lxml")

        # PRODUCT VALIDATION
        # Only enforced when both header tags are present; a page without
        # them is still parsed for review cards.
        brand_tag = soup.select_one("h2.product-brand")
        name_tag = soup.select_one("h1.product-name")

        if brand_tag and name_tag:
            brand = brand_tag.get_text(strip=True)
            prod_name = name_tag.get_text(strip=True)

            if expected_brand not in brand or expected_product not in prod_name:
                print("Bukan halaman produk yang sesuai — SKIP")
                wrong_product = True
                break

        # COLLECT REVIEW CARDS
        review_cards = soup.select("div.review-card")

        if review_cards:
            print(f"Ditemukan {len(review_cards)} review pada percobaan ke-{retry+1}")
            break
        else:
            print("Tidak menemukan review. Coba ulang.")
            retry += 1
            time.sleep(retry_delay)

    if not review_cards:
        # The "retries exhausted" message would be misleading after a
        # wrong-product skip, which has already printed its own message.
        if not wrong_product:
            print(f"Tetap kosong setelah {max_retry}x percobaan. Halaman dilewati.")
        continue

    # EXTRACT REVIEWS
    for card in review_cards:
        if len(all_reviews) >= max_reviews:
            break

        text_tag = card.select_one("p.text-content")
        review_text = text_tag.get_text(" ").strip() if text_tag else "N/A"

        # Rating = number of filled star icons in the card.
        stars = card.select("div.review-card-rating-wrapper i.icon-ic_big_star_full")
        rating = len(stars)

        # A card without a date element yields "UNKNOWN" instead of crashing.
        date_tag = card.select_one("p.review-date")
        raw_date = date_tag.get_text(strip=True) if date_tag else ""

        parsed_date = None

        try:
            # Absolute form, e.g. "05 Jan 2024".
            parsed_date = datetime.strptime(raw_date, "%d %b %Y")
        except ValueError:
            # Relative forms ("3 days ago", "yesterday", ...). The leading
            # token is used only when it is a plain number, so text such as
            # "a few days ago" falls through to "UNKNOWN" instead of raising.
            now = datetime.now()
            tokens = raw_date.split()
            amount = int(tokens[0]) if tokens and tokens[0].isdecimal() else None

            if amount is not None and "days ago" in raw_date:
                parsed_date = now - timedelta(days=amount)
            elif amount is not None and "hours ago" in raw_date:
                parsed_date = now - timedelta(hours=amount)
            elif amount is not None and "minutes ago" in raw_date:
                parsed_date = now - timedelta(minutes=amount)
            elif "yesterday" in raw_date.lower():
                parsed_date = now - timedelta(days=1)
            elif "today" in raw_date.lower():
                parsed_date = now

        formatted_date = parsed_date.strftime("%Y-%m-%d") if parsed_date else "UNKNOWN"

        all_reviews.append({
            "Review Text": review_text,
            "Rating": rating,
            "Date": formatted_date
        })

    print(f"Total terkumpul sekarang: {len(all_reviews)}")
    time.sleep(retry_delay)

# SAVE
# Fixing the column list keeps the schema stable even when no review was
# collected; otherwise the frame has no 'Date' column and the year summary
# below raises KeyError.
df = pd.DataFrame(all_reviews, columns=["Review Text", "Rating", "Date"])

folder = "/content/drive/MyDrive/FD_Scraping_500"
os.makedirs(folder, exist_ok=True)

save_path = f"{folder}/{product_name}_500reviews_2.xlsx"
# DataFrame.to_excel no longer accepts `encoding` (removed in pandas 2.0,
# where passing it raises TypeError), so the keyword is dropped.
df.to_excel(save_path, index=False)

print("\nSELESAI!")
print("File tersimpan:", save_path)
print("Total review:", len(df))

# YEAR DISTRIBUTION
# 'Date' holds either "YYYY-MM-DD" or "UNKNOWN"; the explicit format avoids
# per-element format guessing and "UNKNOWN" is coerced to NaT (shown as NaN).
# The 'Year' column is added after saving, so it is not part of the file.
df['Year'] = pd.to_datetime(df['Date'], format="%Y-%m-%d", errors='coerce').dt.year
print("\nDistribusi review per tahun:")
print(df['Year'].value_counts(dropna=False))

## Azarine

In [None]:
# LIBRARY
import requests, time, os
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# CONFIG
# Label used in the output filename, plus the brand/product text each fetched
# page must contain to be accepted.
product_name = "AzarineCosmetic"
expected_product = "Hydrasoothe Sunscreen Gel SPF 45+++"
expected_brand = "Azarine Cosmetic"

# Page range to walk and the cap on collected reviews.
start_page = 1
end_page = 300
max_reviews = 500

# Fetch policy.
max_retry = 3          # attempts per page before the page is skipped
retry_delay = 2        # seconds slept between attempts and between pages
request_timeout = 30   # seconds; without a timeout a stalled connection hangs the cell

# Review listing URL; "{}" is replaced by the page number.
base_url = "https://reviews.femaledaily.com/products/moisturizer/sun-protection-1/azarine-cosmetic/hydrashoothe-sunscreen-gel-spf45-3?cat=&cat_id=0&age_range=&skin_type=&skin_tone=&skin_undertone=&hair_texture=&hair_type=&order=newest&page={}"
headers = {"User-Agent": "Mozilla/5.0"}

# One dict per review: "Review Text", "Rating", "Date".
all_reviews = []

# SCRAPING
# Walk the listing pages in order until max_reviews rows are collected or
# end_page is reached.
for page in range(start_page, end_page + 1):

    if len(all_reviews) >= max_reviews:
        print(f"Stop: sudah terkumpul {max_reviews} review.")
        break

    print(f"\n================ PAGE {page} ================")

    retry = 0
    review_cards = []
    wrong_product = False

    while retry < max_retry:
        print(f"Mengambil halaman {page} (Percobaan {retry+1}/{max_retry})")
        url = base_url.format(page)

        # A connection error or timeout uses up one attempt instead of
        # aborting the whole run.
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as err:
            print(f"Request gagal ({err}), coba lagi.")
            retry += 1
            time.sleep(retry_delay)
            continue

        if response.status_code != 200:
            print("Status code gagal, coba lagi.")
            retry += 1
            time.sleep(retry_delay)
            continue

        soup = BeautifulSoup(response.text, "lxml")

        # PRODUCT VALIDATION
        # Only enforced when both header tags are present; a page without
        # them is still parsed for review cards.
        brand_tag = soup.select_one("h2.product-brand")
        name_tag = soup.select_one("h1.product-name")

        if brand_tag and name_tag:
            brand = brand_tag.get_text(strip=True)
            prod_name = name_tag.get_text(strip=True)

            if expected_brand not in brand or expected_product not in prod_name:
                print("Bukan halaman produk yang sesuai — SKIP")
                wrong_product = True
                break

        # COLLECT REVIEW CARDS
        review_cards = soup.select("div.review-card")

        if review_cards:
            print(f"Ditemukan {len(review_cards)} review pada percobaan ke-{retry+1}")
            break
        else:
            print("Tidak menemukan review. Coba ulang.")
            retry += 1
            time.sleep(retry_delay)

    if not review_cards:
        # The "retries exhausted" message would be misleading after a
        # wrong-product skip, which has already printed its own message.
        if not wrong_product:
            print(f"Tetap kosong setelah {max_retry}x percobaan. Halaman dilewati.")
        continue

    # EXTRACT REVIEWS
    for card in review_cards:
        if len(all_reviews) >= max_reviews:
            break

        text_tag = card.select_one("p.text-content")
        review_text = text_tag.get_text(" ").strip() if text_tag else "N/A"

        # Rating = number of filled star icons in the card.
        stars = card.select("div.review-card-rating-wrapper i.icon-ic_big_star_full")
        rating = len(stars)

        # A card without a date element yields "UNKNOWN" instead of crashing.
        date_tag = card.select_one("p.review-date")
        raw_date = date_tag.get_text(strip=True) if date_tag else ""

        parsed_date = None

        try:
            # Absolute form, e.g. "05 Jan 2024".
            parsed_date = datetime.strptime(raw_date, "%d %b %Y")
        except ValueError:
            # Relative forms ("3 days ago", "yesterday", ...). The leading
            # token is used only when it is a plain number, so text such as
            # "a few days ago" falls through to "UNKNOWN" instead of raising.
            now = datetime.now()
            tokens = raw_date.split()
            amount = int(tokens[0]) if tokens and tokens[0].isdecimal() else None

            if amount is not None and "days ago" in raw_date:
                parsed_date = now - timedelta(days=amount)
            elif amount is not None and "hours ago" in raw_date:
                parsed_date = now - timedelta(hours=amount)
            elif amount is not None and "minutes ago" in raw_date:
                parsed_date = now - timedelta(minutes=amount)
            elif "yesterday" in raw_date.lower():
                parsed_date = now - timedelta(days=1)
            elif "today" in raw_date.lower():
                parsed_date = now

        formatted_date = parsed_date.strftime("%Y-%m-%d") if parsed_date else "UNKNOWN"

        all_reviews.append({
            "Review Text": review_text,
            "Rating": rating,
            "Date": formatted_date
        })

    print(f"Total terkumpul sekarang: {len(all_reviews)}")
    time.sleep(retry_delay)

# SAVE
# Fixing the column list keeps the schema stable even when no review was
# collected; otherwise the frame has no 'Date' column and the year summary
# below raises KeyError.
df = pd.DataFrame(all_reviews, columns=["Review Text", "Rating", "Date"])

folder = "/content/drive/MyDrive/FD_Scraping_500"
os.makedirs(folder, exist_ok=True)

save_path = f"{folder}/{product_name}_500reviews.xlsx"
# DataFrame.to_excel no longer accepts `encoding` (removed in pandas 2.0,
# where passing it raises TypeError), so the keyword is dropped.
df.to_excel(save_path, index=False)

print("\nSELESAI!")
print("File tersimpan:", save_path)
print("Total review:", len(df))

# YEAR DISTRIBUTION
# 'Date' holds either "YYYY-MM-DD" or "UNKNOWN"; the explicit format avoids
# per-element format guessing and "UNKNOWN" is coerced to NaT (shown as NaN).
# The 'Year' column is added after saving, so it is not part of the file.
df['Year'] = pd.to_datetime(df['Date'], format="%Y-%m-%d", errors='coerce').dt.year
print("\nDistribusi review per tahun:")
print(df['Year'].value_counts(dropna=False))

## Skin Aqua Milk

In [None]:
# LIBRARY
import requests, time, os
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# CONFIG
# Label used in the output filename, plus the brand/product text each fetched
# page must contain to be accepted.
product_name = "SkinAquaMilk"
expected_product = "UV Moisture Milk"
expected_brand = "Skin Aqua"

# Page range to walk and the cap on collected reviews.
start_page = 1
end_page = 300
max_reviews = 500

# Fetch policy.
max_retry = 3          # attempts per page before the page is skipped
retry_delay = 2        # seconds slept between attempts and between pages
request_timeout = 30   # seconds; without a timeout a stalled connection hangs the cell

# Review listing URL; "{}" is replaced by the page number.
base_url = "https://reviews.femaledaily.com/products/moisturizer/sun-protection-1/skin-aqua/uv-moisture-milk?cat=&cat_id=0&age_range=&skin_type=&skin_tone=&skin_undertone=&hair_texture=&hair_type=&order=newest&page={}"
headers = {"User-Agent": "Mozilla/5.0"}

# One dict per review: "Review Text", "Rating", "Date".
all_reviews = []

# SCRAPING
# Walk the listing pages in order until max_reviews rows are collected or
# end_page is reached.
for page in range(start_page, end_page + 1):

    if len(all_reviews) >= max_reviews:
        print(f"Stop: sudah terkumpul {max_reviews} review.")
        break

    print(f"\n================ PAGE {page} ================")

    retry = 0
    review_cards = []
    wrong_product = False

    while retry < max_retry:
        print(f"Mengambil halaman {page} (Percobaan {retry+1}/{max_retry})")
        url = base_url.format(page)

        # A connection error or timeout uses up one attempt instead of
        # aborting the whole run.
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as err:
            print(f"Request gagal ({err}), coba lagi.")
            retry += 1
            time.sleep(retry_delay)
            continue

        if response.status_code != 200:
            print("Status code gagal, coba lagi.")
            retry += 1
            time.sleep(retry_delay)
            continue

        soup = BeautifulSoup(response.text, "lxml")

        # PRODUCT VALIDATION
        # Only enforced when both header tags are present; a page without
        # them is still parsed for review cards.
        brand_tag = soup.select_one("h2.product-brand")
        name_tag = soup.select_one("h1.product-name")

        if brand_tag and name_tag:
            brand = brand_tag.get_text(strip=True)
            prod_name = name_tag.get_text(strip=True)

            if expected_brand not in brand or expected_product not in prod_name:
                print("Bukan halaman produk yang sesuai — SKIP")
                wrong_product = True
                break

        # COLLECT REVIEW CARDS
        review_cards = soup.select("div.review-card")

        if review_cards:
            print(f"Ditemukan {len(review_cards)} review pada percobaan ke-{retry+1}")
            break
        else:
            print("Tidak menemukan review. Coba ulang.")
            retry += 1
            time.sleep(retry_delay)

    if not review_cards:
        # The "retries exhausted" message would be misleading after a
        # wrong-product skip, which has already printed its own message.
        if not wrong_product:
            print(f"Tetap kosong setelah {max_retry}x percobaan. Halaman dilewati.")
        continue

    # EXTRACT REVIEWS
    for card in review_cards:
        if len(all_reviews) >= max_reviews:
            break

        text_tag = card.select_one("p.text-content")
        review_text = text_tag.get_text(" ").strip() if text_tag else "N/A"

        # Rating = number of filled star icons in the card.
        stars = card.select("div.review-card-rating-wrapper i.icon-ic_big_star_full")
        rating = len(stars)

        # A card without a date element yields "UNKNOWN" instead of crashing.
        date_tag = card.select_one("p.review-date")
        raw_date = date_tag.get_text(strip=True) if date_tag else ""

        parsed_date = None

        try:
            # Absolute form, e.g. "05 Jan 2024".
            parsed_date = datetime.strptime(raw_date, "%d %b %Y")
        except ValueError:
            # Relative forms ("3 days ago", "yesterday", ...). The leading
            # token is used only when it is a plain number, so text such as
            # "a few days ago" falls through to "UNKNOWN" instead of raising.
            now = datetime.now()
            tokens = raw_date.split()
            amount = int(tokens[0]) if tokens and tokens[0].isdecimal() else None

            if amount is not None and "days ago" in raw_date:
                parsed_date = now - timedelta(days=amount)
            elif amount is not None and "hours ago" in raw_date:
                parsed_date = now - timedelta(hours=amount)
            elif amount is not None and "minutes ago" in raw_date:
                parsed_date = now - timedelta(minutes=amount)
            elif "yesterday" in raw_date.lower():
                parsed_date = now - timedelta(days=1)
            elif "today" in raw_date.lower():
                parsed_date = now

        formatted_date = parsed_date.strftime("%Y-%m-%d") if parsed_date else "UNKNOWN"

        all_reviews.append({
            "Review Text": review_text,
            "Rating": rating,
            "Date": formatted_date
        })

    print(f"Total terkumpul sekarang: {len(all_reviews)}")
    time.sleep(retry_delay)

# SAVE
# Fixing the column list keeps the schema stable even when no review was
# collected; otherwise the frame has no 'Date' column and the year summary
# below raises KeyError.
df = pd.DataFrame(all_reviews, columns=["Review Text", "Rating", "Date"])

folder = "/content/drive/MyDrive/FD_Scraping_500"
os.makedirs(folder, exist_ok=True)

save_path = f"{folder}/{product_name}_500reviews.xlsx"
# DataFrame.to_excel no longer accepts `encoding` (removed in pandas 2.0,
# where passing it raises TypeError), so the keyword is dropped.
df.to_excel(save_path, index=False)

print("\nSELESAI!")
print("File tersimpan:", save_path)
print("Total review:", len(df))

# YEAR DISTRIBUTION
# 'Date' holds either "YYYY-MM-DD" or "UNKNOWN"; the explicit format avoids
# per-element format guessing and "UNKNOWN" is coerced to NaT (shown as NaN).
# The 'Year' column is added after saving, so it is not part of the file.
df['Year'] = pd.to_datetime(df['Date'], format="%Y-%m-%d", errors='coerce').dt.year
print("\nDistribusi review per tahun:")
print(df['Year'].value_counts(dropna=False))

## Biore

In [None]:
# LIBRARY
import requests, time, os
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# CONFIG
# Label used in the output filename, plus the brand/product text each fetched
# page must contain to be accepted.
product_name = "Biore"
expected_product = "UV Aqua Rich Watery Essence SPF 50+ PA++++"
expected_brand = "Biore"

# Page range to walk and the cap on collected reviews.
start_page = 1
end_page = 300
max_reviews = 500

# Fetch policy.
max_retry = 3          # attempts per page before the page is skipped
retry_delay = 2        # seconds slept between attempts and between pages
request_timeout = 30   # seconds; without a timeout a stalled connection hangs the cell

# Review listing URL; "{}" is replaced by the page number.
base_url = "https://reviews.femaledaily.com/products/moisturizer/sun-protection-1/biore/uv-aqua-rich-watery-essence-1?cat=&cat_id=0&age_range=&skin_type=&skin_tone=&skin_undertone=&hair_texture=&hair_type=&order=newest&page={}"
headers = {"User-Agent": "Mozilla/5.0"}

# One dict per review: "Review Text", "Rating", "Date".
all_reviews = []

# SCRAPING
# Walk the listing pages in order until max_reviews rows are collected or
# end_page is reached.
for page in range(start_page, end_page + 1):

    if len(all_reviews) >= max_reviews:
        print(f"Stop: sudah terkumpul {max_reviews} review.")
        break

    print(f"\n================ PAGE {page} ================")

    retry = 0
    review_cards = []
    wrong_product = False

    while retry < max_retry:
        print(f"Mengambil halaman {page} (Percobaan {retry+1}/{max_retry})")
        url = base_url.format(page)

        # A connection error or timeout uses up one attempt instead of
        # aborting the whole run.
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as err:
            print(f"Request gagal ({err}), coba lagi.")
            retry += 1
            time.sleep(retry_delay)
            continue

        if response.status_code != 200:
            print("Status code gagal, coba lagi.")
            retry += 1
            time.sleep(retry_delay)
            continue

        soup = BeautifulSoup(response.text, "lxml")

        # PRODUCT VALIDATION
        # Only enforced when both header tags are present; a page without
        # them is still parsed for review cards.
        brand_tag = soup.select_one("h2.product-brand")
        name_tag = soup.select_one("h1.product-name")

        if brand_tag and name_tag:
            brand = brand_tag.get_text(strip=True)
            prod_name = name_tag.get_text(strip=True)

            if expected_brand not in brand or expected_product not in prod_name:
                print("Bukan halaman produk yang sesuai — SKIP")
                wrong_product = True
                break

        # COLLECT REVIEW CARDS
        review_cards = soup.select("div.review-card")

        if review_cards:
            print(f"Ditemukan {len(review_cards)} review pada percobaan ke-{retry+1}")
            break
        else:
            print("Tidak menemukan review. Coba ulang.")
            retry += 1
            time.sleep(retry_delay)

    if not review_cards:
        # The "retries exhausted" message would be misleading after a
        # wrong-product skip, which has already printed its own message.
        if not wrong_product:
            print(f"Tetap kosong setelah {max_retry}x percobaan. Halaman dilewati.")
        continue

    # EXTRACT REVIEWS
    for card in review_cards:
        if len(all_reviews) >= max_reviews:
            break

        text_tag = card.select_one("p.text-content")
        review_text = text_tag.get_text(" ").strip() if text_tag else "N/A"

        # Rating = number of filled star icons in the card.
        stars = card.select("div.review-card-rating-wrapper i.icon-ic_big_star_full")
        rating = len(stars)

        # A card without a date element yields "UNKNOWN" instead of crashing.
        date_tag = card.select_one("p.review-date")
        raw_date = date_tag.get_text(strip=True) if date_tag else ""

        parsed_date = None

        try:
            # Absolute form, e.g. "05 Jan 2024".
            parsed_date = datetime.strptime(raw_date, "%d %b %Y")
        except ValueError:
            # Relative forms ("3 days ago", "yesterday", ...). The leading
            # token is used only when it is a plain number, so text such as
            # "a few days ago" falls through to "UNKNOWN" instead of raising.
            now = datetime.now()
            tokens = raw_date.split()
            amount = int(tokens[0]) if tokens and tokens[0].isdecimal() else None

            if amount is not None and "days ago" in raw_date:
                parsed_date = now - timedelta(days=amount)
            elif amount is not None and "hours ago" in raw_date:
                parsed_date = now - timedelta(hours=amount)
            elif amount is not None and "minutes ago" in raw_date:
                parsed_date = now - timedelta(minutes=amount)
            elif "yesterday" in raw_date.lower():
                parsed_date = now - timedelta(days=1)
            elif "today" in raw_date.lower():
                parsed_date = now

        formatted_date = parsed_date.strftime("%Y-%m-%d") if parsed_date else "UNKNOWN"

        all_reviews.append({
            "Review Text": review_text,
            "Rating": rating,
            "Date": formatted_date
        })

    print(f"Total terkumpul sekarang: {len(all_reviews)}")
    time.sleep(retry_delay)

# SAVE
# Fixing the column list keeps the schema stable even when no review was
# collected; otherwise the frame has no 'Date' column and the year summary
# below raises KeyError.
df = pd.DataFrame(all_reviews, columns=["Review Text", "Rating", "Date"])

folder = "/content/drive/MyDrive/FD_Scraping_500"
os.makedirs(folder, exist_ok=True)

save_path = f"{folder}/{product_name}_500reviews.xlsx"
# DataFrame.to_excel no longer accepts `encoding` (removed in pandas 2.0,
# where passing it raises TypeError), so the keyword is dropped.
df.to_excel(save_path, index=False)

print("\nSELESAI!")
print("File tersimpan:", save_path)
print("Total review:", len(df))

# YEAR DISTRIBUTION
# 'Date' holds either "YYYY-MM-DD" or "UNKNOWN"; the explicit format avoids
# per-element format guessing and "UNKNOWN" is coerced to NaT (shown as NaN).
# The 'Year' column is added after saving, so it is not part of the file.
df['Year'] = pd.to_datetime(df['Date'], format="%Y-%m-%d", errors='coerce').dt.year
print("\nDistribusi review per tahun:")
print(df['Year'].value_counts(dropna=False))

## NPURE

In [None]:
# LIBRARY
import requests, time, os
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# CONFIG
# Label used in the output filename, plus the brand/product text each fetched
# page must contain to be accepted.
product_name = "NPURE"
expected_product = "Cica Beat The Sun"
expected_brand = "NPURE"

# Page range to walk and the cap on collected reviews.
start_page = 1
end_page = 300
max_reviews = 500

# Fetch policy.
max_retry = 3          # attempts per page before the page is skipped
retry_delay = 2        # seconds slept between attempts and between pages
request_timeout = 30   # seconds; without a timeout a stalled connection hangs the cell

# Review listing URL; "{}" is replaced by the page number.
base_url = "https://reviews.femaledaily.com/products/moisturizer/sun-protection-1/npure/cica-beat-the-sun?cat=&cat_id=0&age_range=&skin_type=&skin_tone=&skin_undertone=&hair_texture=&hair_type=&order=newest&page={}"
headers = {"User-Agent": "Mozilla/5.0"}

# One dict per review: "Review Text", "Rating", "Date".
all_reviews = []

# SCRAPING
# Walk the listing pages in order until max_reviews rows are collected or
# end_page is reached.
for page in range(start_page, end_page + 1):

    if len(all_reviews) >= max_reviews:
        print(f"Stop: sudah terkumpul {max_reviews} review.")
        break

    print(f"\n================ PAGE {page} ================")

    retry = 0
    review_cards = []
    wrong_product = False

    while retry < max_retry:
        print(f"Mengambil halaman {page} (Percobaan {retry+1}/{max_retry})")
        url = base_url.format(page)

        # A connection error or timeout uses up one attempt instead of
        # aborting the whole run.
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as err:
            print(f"Request gagal ({err}), coba lagi.")
            retry += 1
            time.sleep(retry_delay)
            continue

        if response.status_code != 200:
            print("Status code gagal, coba lagi.")
            retry += 1
            time.sleep(retry_delay)
            continue

        soup = BeautifulSoup(response.text, "lxml")

        # PRODUCT VALIDATION
        # Only enforced when both header tags are present; a page without
        # them is still parsed for review cards.
        brand_tag = soup.select_one("h2.product-brand")
        name_tag = soup.select_one("h1.product-name")

        if brand_tag and name_tag:
            brand = brand_tag.get_text(strip=True)
            prod_name = name_tag.get_text(strip=True)

            if expected_brand not in brand or expected_product not in prod_name:
                print("Bukan halaman produk yang sesuai — SKIP")
                wrong_product = True
                break

        # COLLECT REVIEW CARDS
        review_cards = soup.select("div.review-card")

        if review_cards:
            print(f"Ditemukan {len(review_cards)} review pada percobaan ke-{retry+1}")
            break
        else:
            print("Tidak menemukan review. Coba ulang.")
            retry += 1
            time.sleep(retry_delay)

    if not review_cards:
        # The "retries exhausted" message would be misleading after a
        # wrong-product skip, which has already printed its own message.
        if not wrong_product:
            print(f"Tetap kosong setelah {max_retry}x percobaan. Halaman dilewati.")
        continue

    # EXTRACT REVIEWS
    for card in review_cards:
        if len(all_reviews) >= max_reviews:
            break

        text_tag = card.select_one("p.text-content")
        review_text = text_tag.get_text(" ").strip() if text_tag else "N/A"

        # Rating = number of filled star icons in the card.
        stars = card.select("div.review-card-rating-wrapper i.icon-ic_big_star_full")
        rating = len(stars)

        # A card without a date element yields "UNKNOWN" instead of crashing.
        date_tag = card.select_one("p.review-date")
        raw_date = date_tag.get_text(strip=True) if date_tag else ""

        parsed_date = None

        try:
            # Absolute form, e.g. "05 Jan 2024".
            parsed_date = datetime.strptime(raw_date, "%d %b %Y")
        except ValueError:
            # Relative forms ("3 days ago", "yesterday", ...). The leading
            # token is used only when it is a plain number, so text such as
            # "a few days ago" falls through to "UNKNOWN" instead of raising.
            now = datetime.now()
            tokens = raw_date.split()
            amount = int(tokens[0]) if tokens and tokens[0].isdecimal() else None

            if amount is not None and "days ago" in raw_date:
                parsed_date = now - timedelta(days=amount)
            elif amount is not None and "hours ago" in raw_date:
                parsed_date = now - timedelta(hours=amount)
            elif amount is not None and "minutes ago" in raw_date:
                parsed_date = now - timedelta(minutes=amount)
            elif "yesterday" in raw_date.lower():
                parsed_date = now - timedelta(days=1)
            elif "today" in raw_date.lower():
                parsed_date = now

        formatted_date = parsed_date.strftime("%Y-%m-%d") if parsed_date else "UNKNOWN"

        all_reviews.append({
            "Review Text": review_text,
            "Rating": rating,
            "Date": formatted_date
        })

    print(f"Total terkumpul sekarang: {len(all_reviews)}")
    time.sleep(retry_delay)

# SAVE
# Fixing the column list keeps the schema stable even when no review was
# collected; otherwise the frame has no 'Date' column and the year summary
# below raises KeyError.
df = pd.DataFrame(all_reviews, columns=["Review Text", "Rating", "Date"])

folder = "/content/drive/MyDrive/FD_Scraping_500"
os.makedirs(folder, exist_ok=True)

save_path = f"{folder}/{product_name}_500reviews.xlsx"
# DataFrame.to_excel no longer accepts `encoding` (removed in pandas 2.0,
# where passing it raises TypeError), so the keyword is dropped.
df.to_excel(save_path, index=False)

print("\nSELESAI!")
print("File tersimpan:", save_path)
print("Total review:", len(df))

# YEAR DISTRIBUTION
# 'Date' holds either "YYYY-MM-DD" or "UNKNOWN"; the explicit format avoids
# per-element format guessing and "UNKNOWN" is coerced to NaT (shown as NaN).
# The 'Year' column is added after saving, so it is not part of the file.
df['Year'] = pd.to_datetime(df['Date'], format="%Y-%m-%d", errors='coerce').dt.year
print("\nDistribusi review per tahun:")
print(df['Year'].value_counts(dropna=False))

## Skin Aqua Gel

In [None]:
# LIBRARY
import requests, time, os
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# CONFIG
# Label used in the output filename, plus the brand/product text each fetched
# page must contain to be accepted.
product_name = "SkinAquaGel"
expected_product = "UV Moisture Gel"
expected_brand = "Skin Aqua"

# Page range to walk and the cap on collected reviews.
start_page = 1
end_page = 300
max_reviews = 500

# Fetch policy.
max_retry = 3          # attempts per page before the page is skipped
retry_delay = 2        # seconds slept between attempts and between pages
request_timeout = 30   # seconds; without a timeout a stalled connection hangs the cell

# Review listing URL; "{}" is replaced by the page number.
base_url = "https://reviews.femaledaily.com/products/moisturizer/sun-protection-1/skin-aqua/uv-moisture-gel-69?cat=&cat_id=0&age_range=&skin_type=&skin_tone=&skin_undertone=&hair_texture=&hair_type=&order=newest&page={}"
headers = {"User-Agent": "Mozilla/5.0"}

# One dict per review: "Review Text", "Rating", "Date".
all_reviews = []

# SCRAPING
# Walk the listing pages in order until max_reviews rows are collected or
# end_page is reached.
for page in range(start_page, end_page + 1):

    if len(all_reviews) >= max_reviews:
        print(f"Stop: sudah terkumpul {max_reviews} review.")
        break

    print(f"\n================ PAGE {page} ================")

    retry = 0
    review_cards = []
    wrong_product = False

    while retry < max_retry:
        print(f"Mengambil halaman {page} (Percobaan {retry+1}/{max_retry})")
        url = base_url.format(page)

        # A connection error or timeout uses up one attempt instead of
        # aborting the whole run.
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as err:
            print(f"Request gagal ({err}), coba lagi.")
            retry += 1
            time.sleep(retry_delay)
            continue

        if response.status_code != 200:
            print("Status code gagal, coba lagi.")
            retry += 1
            time.sleep(retry_delay)
            continue

        soup = BeautifulSoup(response.text, "lxml")

        # PRODUCT VALIDATION
        # Only enforced when both header tags are present; a page without
        # them is still parsed for review cards.
        brand_tag = soup.select_one("h2.product-brand")
        name_tag = soup.select_one("h1.product-name")

        if brand_tag and name_tag:
            brand = brand_tag.get_text(strip=True)
            prod_name = name_tag.get_text(strip=True)

            if expected_brand not in brand or expected_product not in prod_name:
                print("Bukan halaman produk yang sesuai — SKIP")
                wrong_product = True
                break

        # COLLECT REVIEW CARDS
        review_cards = soup.select("div.review-card")

        if review_cards:
            print(f"Ditemukan {len(review_cards)} review pada percobaan ke-{retry+1}")
            break
        else:
            print("Tidak menemukan review. Coba ulang.")
            retry += 1
            time.sleep(retry_delay)

    if not review_cards:
        # The "retries exhausted" message would be misleading after a
        # wrong-product skip, which has already printed its own message.
        if not wrong_product:
            print(f"Tetap kosong setelah {max_retry}x percobaan. Halaman dilewati.")
        continue

    # EXTRACT REVIEWS
    for card in review_cards:
        if len(all_reviews) >= max_reviews:
            break

        text_tag = card.select_one("p.text-content")
        review_text = text_tag.get_text(" ").strip() if text_tag else "N/A"

        # Rating = number of filled star icons in the card.
        stars = card.select("div.review-card-rating-wrapper i.icon-ic_big_star_full")
        rating = len(stars)

        # A card without a date element yields "UNKNOWN" instead of crashing.
        date_tag = card.select_one("p.review-date")
        raw_date = date_tag.get_text(strip=True) if date_tag else ""

        parsed_date = None

        try:
            # Absolute form, e.g. "05 Jan 2024".
            parsed_date = datetime.strptime(raw_date, "%d %b %Y")
        except ValueError:
            # Relative forms ("3 days ago", "yesterday", ...). The leading
            # token is used only when it is a plain number, so text such as
            # "a few days ago" falls through to "UNKNOWN" instead of raising.
            now = datetime.now()
            tokens = raw_date.split()
            amount = int(tokens[0]) if tokens and tokens[0].isdecimal() else None

            if amount is not None and "days ago" in raw_date:
                parsed_date = now - timedelta(days=amount)
            elif amount is not None and "hours ago" in raw_date:
                parsed_date = now - timedelta(hours=amount)
            elif amount is not None and "minutes ago" in raw_date:
                parsed_date = now - timedelta(minutes=amount)
            elif "yesterday" in raw_date.lower():
                parsed_date = now - timedelta(days=1)
            elif "today" in raw_date.lower():
                parsed_date = now

        formatted_date = parsed_date.strftime("%Y-%m-%d") if parsed_date else "UNKNOWN"

        all_reviews.append({
            "Review Text": review_text,
            "Rating": rating,
            "Date": formatted_date
        })

    print(f"Total terkumpul sekarang: {len(all_reviews)}")
    time.sleep(retry_delay)

# SAVE
# Fixing the column list keeps the schema stable even when no review was
# collected; otherwise the frame has no 'Date' column and the year summary
# below raises KeyError.
df = pd.DataFrame(all_reviews, columns=["Review Text", "Rating", "Date"])

folder = "/content/drive/MyDrive/FD_Scraping_500"
os.makedirs(folder, exist_ok=True)

save_path = f"{folder}/{product_name}_500reviews.xlsx"
# DataFrame.to_excel no longer accepts `encoding` (removed in pandas 2.0,
# where passing it raises TypeError), so the keyword is dropped.
df.to_excel(save_path, index=False)

print("\nSELESAI!")
print("File tersimpan:", save_path)
print("Total review:", len(df))

# YEAR DISTRIBUTION
# 'Date' holds either "YYYY-MM-DD" or "UNKNOWN"; the explicit format avoids
# per-element format guessing and "UNKNOWN" is coerced to NaT (shown as NaN).
# The 'Year' column is added after saving, so it is not part of the file.
df['Year'] = pd.to_datetime(df['Date'], format="%Y-%m-%d", errors='coerce').dt.year
print("\nDistribusi review per tahun:")
print(df['Year'].value_counts(dropna=False))

## L'Oreal Paris

In [None]:
# LIBRARY
import requests, time, os
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# CONFIG
# Label used in the output filename, plus the brand/product text each fetched
# page must contain to be accepted.
product_name = "LOreal"
expected_product = "UV Perfect Matte & Fresh Long UV SPF 50/ PA++++"
expected_brand = "L'Oreal Paris"

# Page range to walk and the cap on collected reviews.
start_page = 1
end_page = 300
max_reviews = 500

# Fetch policy.
max_retry = 3          # attempts per page before the page is skipped
retry_delay = 2        # seconds slept between attempts and between pages
request_timeout = 30   # seconds; without a timeout a stalled connection hangs the cell

# Review listing URL; "{}" is replaced by the page number.
base_url = "https://reviews.femaledaily.com/products/moisturizer/sun-protection-1/l-oreal-paris/uv-perfect-matte-and-fresh-long-uv-spf-50-pa?cat=&cat_id=0&age_range=&skin_type=&skin_tone=&skin_undertone=&hair_texture=&hair_type=&order=newest&page={}"
headers = {"User-Agent": "Mozilla/5.0"}

# One dict per review: "Review Text", "Rating", "Date".
all_reviews = []

# SCRAPING
# Walk the listing pages in order until max_reviews rows are collected or
# end_page is reached.
for page in range(start_page, end_page + 1):

    if len(all_reviews) >= max_reviews:
        print(f"Stop: sudah terkumpul {max_reviews} review.")
        break

    print(f"\n================ PAGE {page} ================")

    retry = 0
    review_cards = []
    wrong_product = False

    while retry < max_retry:
        print(f"Mengambil halaman {page} (Percobaan {retry+1}/{max_retry})")
        url = base_url.format(page)

        # A connection error or timeout uses up one attempt instead of
        # aborting the whole run.
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as err:
            print(f"Request gagal ({err}), coba lagi.")
            retry += 1
            time.sleep(retry_delay)
            continue

        if response.status_code != 200:
            print("Status code gagal, coba lagi.")
            retry += 1
            time.sleep(retry_delay)
            continue

        soup = BeautifulSoup(response.text, "lxml")

        # PRODUCT VALIDATION
        # Only enforced when both header tags are present; a page without
        # them is still parsed for review cards.
        brand_tag = soup.select_one("h2.product-brand")
        name_tag = soup.select_one("h1.product-name")

        if brand_tag and name_tag:
            brand = brand_tag.get_text(strip=True)
            prod_name = name_tag.get_text(strip=True)

            if expected_brand not in brand or expected_product not in prod_name:
                print("Bukan halaman produk yang sesuai — SKIP")
                wrong_product = True
                break

        # COLLECT REVIEW CARDS
        review_cards = soup.select("div.review-card")

        if review_cards:
            print(f"Ditemukan {len(review_cards)} review pada percobaan ke-{retry+1}")
            break
        else:
            print("Tidak menemukan review. Coba ulang.")
            retry += 1
            time.sleep(retry_delay)

    if not review_cards:
        # The "retries exhausted" message would be misleading after a
        # wrong-product skip, which has already printed its own message.
        if not wrong_product:
            print(f"Tetap kosong setelah {max_retry}x percobaan. Halaman dilewati.")
        continue

    # EXTRACT REVIEWS
    for card in review_cards:
        if len(all_reviews) >= max_reviews:
            break

        text_tag = card.select_one("p.text-content")
        review_text = text_tag.get_text(" ").strip() if text_tag else "N/A"

        # Rating = number of filled star icons in the card.
        stars = card.select("div.review-card-rating-wrapper i.icon-ic_big_star_full")
        rating = len(stars)

        # A card without a date element yields "UNKNOWN" instead of crashing.
        date_tag = card.select_one("p.review-date")
        raw_date = date_tag.get_text(strip=True) if date_tag else ""

        parsed_date = None

        try:
            # Absolute form, e.g. "05 Jan 2024".
            parsed_date = datetime.strptime(raw_date, "%d %b %Y")
        except ValueError:
            # Relative forms ("3 days ago", "yesterday", ...). The leading
            # token is used only when it is a plain number, so text such as
            # "a few days ago" falls through to "UNKNOWN" instead of raising.
            now = datetime.now()
            tokens = raw_date.split()
            amount = int(tokens[0]) if tokens and tokens[0].isdecimal() else None

            if amount is not None and "days ago" in raw_date:
                parsed_date = now - timedelta(days=amount)
            elif amount is not None and "hours ago" in raw_date:
                parsed_date = now - timedelta(hours=amount)
            elif amount is not None and "minutes ago" in raw_date:
                parsed_date = now - timedelta(minutes=amount)
            elif "yesterday" in raw_date.lower():
                parsed_date = now - timedelta(days=1)
            elif "today" in raw_date.lower():
                parsed_date = now

        formatted_date = parsed_date.strftime("%Y-%m-%d") if parsed_date else "UNKNOWN"

        all_reviews.append({
            "Review Text": review_text,
            "Rating": rating,
            "Date": formatted_date
        })

    print(f"Total terkumpul sekarang: {len(all_reviews)}")
    time.sleep(retry_delay)

# SAVE
# Fixing the column list keeps the schema stable even when no review was
# collected; otherwise the frame has no 'Date' column and the year summary
# below raises KeyError.
df = pd.DataFrame(all_reviews, columns=["Review Text", "Rating", "Date"])

folder = "/content/drive/MyDrive/FD_Scraping_500"
os.makedirs(folder, exist_ok=True)

save_path = f"{folder}/{product_name}_500reviews.xlsx"
# DataFrame.to_excel no longer accepts `encoding` (removed in pandas 2.0,
# where passing it raises TypeError), so the keyword is dropped.
df.to_excel(save_path, index=False)

print("\nSELESAI!")
print("File tersimpan:", save_path)
print("Total review:", len(df))

# YEAR DISTRIBUTION
# 'Date' holds either "YYYY-MM-DD" or "UNKNOWN"; the explicit format avoids
# per-element format guessing and "UNKNOWN" is coerced to NaT (shown as NaN).
# The 'Year' column is added after saving, so it is not part of the file.
df['Year'] = pd.to_datetime(df['Date'], format="%Y-%m-%d", errors='coerce').dt.year
print("\nDistribusi review per tahun:")
print(df['Year'].value_counts(dropna=False))

## NIVEA

In [None]:
# LIBRARY
import requests, time, os
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# CONFIG
product_name = "NIVEA"
expected_product = "Sun Protect & White Oil Control Serum SPF 50+"
expected_brand = "NIVEA"

start_page = 1
end_page = 300
max_reviews = 500

base_url = "https://reviews.femaledaily.com/products/moisturizer/sun-protection-1/nivea/sun-protect-and-white-oil-control-serum-spf50-pa?cat=&cat_id=0&age_range=&skin_type=&skin_tone=&skin_undertone=&hair_texture=&hair_type=&order=newest&page={}"
headers = {"User-Agent": "Mozilla/5.0"}
request_timeout = 30  # seconds; without a timeout a stalled connection would hang the cell

all_reviews = []

# SCRAPING
# Walk the paginated review listing (the URL asks for newest first) until
# max_reviews reviews are collected or end_page is reached.
for page in range(start_page, end_page + 1):

    if len(all_reviews) >= max_reviews:
        print("Stop: sudah terkumpul 500 review.")
        break

    print(f"\n================ PAGE {page} ================")

    retry = 0
    max_retry = 3
    review_cards = []

    while retry < max_retry:
        print(f"Mengambil halaman {page} (Percobaan {retry+1}/{max_retry})")
        url = base_url.format(page)

        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as exc:
            # A network error counts as one failed attempt instead of aborting the run
            print(f"Request gagal ({exc}), coba lagi.")
            retry += 1
            time.sleep(2)
            continue

        if response.status_code != 200:
            print("Status code gagal, coba lagi.")
            retry += 1
            time.sleep(2)
            continue

        soup = BeautifulSoup(response.text, "lxml")

        # PRODUCT VALIDATION
        # Only enforced when both header tags are present on the page
        brand_tag = soup.select_one("h2.product-brand")
        name_tag = soup.select_one("h1.product-name")

        if brand_tag and name_tag:
            brand = brand_tag.get_text(strip=True)
            prod_name = name_tag.get_text(strip=True)

            if expected_brand not in brand or expected_product not in prod_name:
                print("Bukan halaman produk yang sesuai — SKIP")
                # review_cards is still empty here, so the page is skipped below
                break

        # COLLECT REVIEW CARDS
        review_cards = soup.select("div.review-card")

        if review_cards:
            print(f"Ditemukan {len(review_cards)} review pada percobaan ke-{retry+1}")
            break
        else:
            print("Tidak menemukan review. Coba ulang.")
            retry += 1
            time.sleep(2)

    if not review_cards:
        print("Tetap kosong setelah 3x percobaan. Halaman dilewati.")
        continue

    # EXTRACT REVIEW
    for card in review_cards:
        if len(all_reviews) >= max_reviews:
            break

        text_tag = card.select_one("p.text-content")
        review_text = text_tag.get_text(" ").strip() if text_tag else "N/A"

        # The rating is the number of "full star" icons inside the card
        stars = card.select("div.review-card-rating-wrapper i.icon-ic_big_star_full")
        rating = len(stars)

        date_tag = card.select_one("p.review-date")
        # A card without a date element would otherwise raise AttributeError on None
        raw_date = date_tag.get_text(strip=True) if date_tag else ""

        parsed_date = None

        try:
            parsed_date = datetime.strptime(raw_date, "%d %b %Y")
        except ValueError:
            # Not an absolute "DD Mon YYYY" date: try the relative labels
            try:
                if "days ago" in raw_date:
                    parsed_date = datetime.now() - timedelta(days=int(raw_date.split()[0]))
                elif "hours ago" in raw_date:
                    parsed_date = datetime.now() - timedelta(hours=int(raw_date.split()[0]))
                elif "minutes ago" in raw_date:
                    parsed_date = datetime.now() - timedelta(minutes=int(raw_date.split()[0]))
                elif "yesterday" in raw_date.lower():
                    parsed_date = datetime.now() - timedelta(days=1)
                elif "today" in raw_date.lower():
                    parsed_date = datetime.now()
            except (ValueError, IndexError):
                # The leading token is not a number; leave the date unknown
                parsed_date = None

        formatted_date = parsed_date.strftime("%Y-%m-%d") if parsed_date else "UNKNOWN"

        all_reviews.append({
            "Review Text": review_text,
            "Rating": rating,
            "Date": formatted_date
        })

    print(f"Total terkumpul sekarang: {len(all_reviews)}")
    time.sleep(2)  # pause between pages

# SAVE
# Explicit columns keep the frame well-formed even when nothing was scraped
df = pd.DataFrame(all_reviews, columns=["Review Text", "Rating", "Date"])

# NOTE(review): the merge step later in this notebook reads from
# "/content/drive/MyDrive/Skripsi/FD_Scraping_500" — confirm which folder is intended.
folder = "/content/drive/MyDrive/FD_Scraping_500"
os.makedirs(folder, exist_ok=True)

save_path = f"{folder}/{product_name}_500reviews.xlsx"
# to_excel() no longer accepts encoding= (removed in pandas 2.0); .xlsx is UTF-8 internally
df.to_excel(save_path, index=False)

print("\nSELESAI!")
print("File tersimpan:", save_path)
print("Total review:", len(df))

# YEAR DISTRIBUTION
# "UNKNOWN" dates are coerced to NaT and appear as NaN in the counts
df['Year'] = pd.to_datetime(df['Date'], errors='coerce').dt.year
print("\nDistribusi review per tahun:")
print(df['Year'].value_counts(dropna=False))

## Carasun

In [None]:
# LIBRARY
import requests, time, os
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# CONFIG
product_name = "Carasun"
expected_product = "Solar Smart UV Protector SPF45 PA++++"
expected_brand = "Carasun"

start_page = 1
end_page = 300
max_reviews = 500

base_url = "https://reviews.femaledaily.com/products/moisturizer/sun-protection-1/carasun/carasun-solar-smart-uv-protector-spf45-pa?cat=&cat_id=0&age_range=&skin_type=&skin_tone=&skin_undertone=&hair_texture=&hair_type=&order=newest&page={}"
headers = {"User-Agent": "Mozilla/5.0"}
request_timeout = 30  # seconds; without a timeout a stalled connection would hang the cell

all_reviews = []

# SCRAPING
# Walk the paginated review listing (the URL asks for newest first) until
# max_reviews reviews are collected or end_page is reached.
for page in range(start_page, end_page + 1):

    if len(all_reviews) >= max_reviews:
        print("Stop: sudah terkumpul 500 review.")
        break

    print(f"\n================ PAGE {page} ================")

    retry = 0
    max_retry = 3
    review_cards = []

    while retry < max_retry:
        print(f"Mengambil halaman {page} (Percobaan {retry+1}/{max_retry})")
        url = base_url.format(page)

        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as exc:
            # A network error counts as one failed attempt instead of aborting the run
            print(f"Request gagal ({exc}), coba lagi.")
            retry += 1
            time.sleep(2)
            continue

        if response.status_code != 200:
            print("Status code gagal, coba lagi.")
            retry += 1
            time.sleep(2)
            continue

        soup = BeautifulSoup(response.text, "lxml")

        # PRODUCT VALIDATION
        # Only enforced when both header tags are present on the page
        brand_tag = soup.select_one("h2.product-brand")
        name_tag = soup.select_one("h1.product-name")

        if brand_tag and name_tag:
            brand = brand_tag.get_text(strip=True)
            prod_name = name_tag.get_text(strip=True)

            if expected_brand not in brand or expected_product not in prod_name:
                print("Bukan halaman produk yang sesuai — SKIP")
                # review_cards is still empty here, so the page is skipped below
                break

        # COLLECT REVIEW CARDS
        review_cards = soup.select("div.review-card")

        if review_cards:
            print(f"Ditemukan {len(review_cards)} review pada percobaan ke-{retry+1}")
            break
        else:
            print("Tidak menemukan review. Coba ulang.")
            retry += 1
            time.sleep(2)

    if not review_cards:
        print("Tetap kosong setelah 3x percobaan. Halaman dilewati.")
        continue

    # EXTRACT REVIEW
    for card in review_cards:
        if len(all_reviews) >= max_reviews:
            break

        text_tag = card.select_one("p.text-content")
        review_text = text_tag.get_text(" ").strip() if text_tag else "N/A"

        # The rating is the number of "full star" icons inside the card
        stars = card.select("div.review-card-rating-wrapper i.icon-ic_big_star_full")
        rating = len(stars)

        date_tag = card.select_one("p.review-date")
        # A card without a date element would otherwise raise AttributeError on None
        raw_date = date_tag.get_text(strip=True) if date_tag else ""

        parsed_date = None

        try:
            parsed_date = datetime.strptime(raw_date, "%d %b %Y")
        except ValueError:
            # Not an absolute "DD Mon YYYY" date: try the relative labels
            try:
                if "days ago" in raw_date:
                    parsed_date = datetime.now() - timedelta(days=int(raw_date.split()[0]))
                elif "hours ago" in raw_date:
                    parsed_date = datetime.now() - timedelta(hours=int(raw_date.split()[0]))
                elif "minutes ago" in raw_date:
                    parsed_date = datetime.now() - timedelta(minutes=int(raw_date.split()[0]))
                elif "yesterday" in raw_date.lower():
                    parsed_date = datetime.now() - timedelta(days=1)
                elif "today" in raw_date.lower():
                    parsed_date = datetime.now()
            except (ValueError, IndexError):
                # The leading token is not a number; leave the date unknown
                parsed_date = None

        formatted_date = parsed_date.strftime("%Y-%m-%d") if parsed_date else "UNKNOWN"

        all_reviews.append({
            "Review Text": review_text,
            "Rating": rating,
            "Date": formatted_date
        })

    print(f"Total terkumpul sekarang: {len(all_reviews)}")
    time.sleep(2)  # pause between pages

# SAVE
# Explicit columns keep the frame well-formed even when nothing was scraped
df = pd.DataFrame(all_reviews, columns=["Review Text", "Rating", "Date"])

# NOTE(review): the merge step later in this notebook reads from
# "/content/drive/MyDrive/Skripsi/FD_Scraping_500" — confirm which folder is intended.
folder = "/content/drive/MyDrive/FD_Scraping_500"
os.makedirs(folder, exist_ok=True)

save_path = f"{folder}/{product_name}_500reviews.xlsx"
# to_excel() no longer accepts encoding= (removed in pandas 2.0); .xlsx is UTF-8 internally
df.to_excel(save_path, index=False)

print("\nSELESAI!")
print("File tersimpan:", save_path)
print("Total review:", len(df))

# YEAR DISTRIBUTION
# "UNKNOWN" dates are coerced to NaT and appear as NaN in the counts
df['Year'] = pd.to_datetime(df['Date'], errors='coerce').dt.year
print("\nDistribusi review per tahun:")
print(df['Year'].value_counts(dropna=False))

## Wardah

In [None]:
# LIBRARY
import requests, time, os
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# CONFIG
product_name = "Wardah"
expected_product = "UV Shield Airy Smooth Sunscreen Serum SPF 50 PA++++"
expected_brand = "Wardah"

start_page = 1
end_page = 300
max_reviews = 500

base_url = "https://reviews.femaledaily.com/products/moisturizer/sun-protection-1/wardah/uv-shield-airy-smooth-sunscreen-serum-spf-50-pa-1?cat=&cat_id=0&age_range=&skin_type=&skin_tone=&skin_undertone=&hair_texture=&hair_type=&order=newest&page={}"
headers = {"User-Agent": "Mozilla/5.0"}
request_timeout = 30  # seconds; without a timeout a stalled connection would hang the cell

all_reviews = []

# SCRAPING
# Walk the paginated review listing (the URL asks for newest first) until
# max_reviews reviews are collected or end_page is reached.
for page in range(start_page, end_page + 1):

    if len(all_reviews) >= max_reviews:
        print("Stop: sudah terkumpul 500 review.")
        break

    print(f"\n================ PAGE {page} ================")

    retry = 0
    max_retry = 3
    review_cards = []

    while retry < max_retry:
        print(f"Mengambil halaman {page} (Percobaan {retry+1}/{max_retry})")
        url = base_url.format(page)

        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as exc:
            # A network error counts as one failed attempt instead of aborting the run
            print(f"Request gagal ({exc}), coba lagi.")
            retry += 1
            time.sleep(2)
            continue

        if response.status_code != 200:
            print("Status code gagal, coba lagi.")
            retry += 1
            time.sleep(2)
            continue

        soup = BeautifulSoup(response.text, "lxml")

        # PRODUCT VALIDATION
        # Only enforced when both header tags are present on the page
        brand_tag = soup.select_one("h2.product-brand")
        name_tag = soup.select_one("h1.product-name")

        if brand_tag and name_tag:
            brand = brand_tag.get_text(strip=True)
            prod_name = name_tag.get_text(strip=True)

            if expected_brand not in brand or expected_product not in prod_name:
                print("Bukan halaman produk yang sesuai — SKIP")
                # review_cards is still empty here, so the page is skipped below
                break

        # COLLECT REVIEW CARDS
        review_cards = soup.select("div.review-card")

        if review_cards:
            print(f"Ditemukan {len(review_cards)} review pada percobaan ke-{retry+1}")
            break
        else:
            print("Tidak menemukan review. Coba ulang.")
            retry += 1
            time.sleep(2)

    if not review_cards:
        print("Tetap kosong setelah 3x percobaan. Halaman dilewati.")
        continue

    # EXTRACT REVIEW
    for card in review_cards:
        if len(all_reviews) >= max_reviews:
            break

        text_tag = card.select_one("p.text-content")
        review_text = text_tag.get_text(" ").strip() if text_tag else "N/A"

        # The rating is the number of "full star" icons inside the card
        stars = card.select("div.review-card-rating-wrapper i.icon-ic_big_star_full")
        rating = len(stars)

        date_tag = card.select_one("p.review-date")
        # A card without a date element would otherwise raise AttributeError on None
        raw_date = date_tag.get_text(strip=True) if date_tag else ""

        parsed_date = None

        try:
            parsed_date = datetime.strptime(raw_date, "%d %b %Y")
        except ValueError:
            # Not an absolute "DD Mon YYYY" date: try the relative labels
            try:
                if "days ago" in raw_date:
                    parsed_date = datetime.now() - timedelta(days=int(raw_date.split()[0]))
                elif "hours ago" in raw_date:
                    parsed_date = datetime.now() - timedelta(hours=int(raw_date.split()[0]))
                elif "minutes ago" in raw_date:
                    parsed_date = datetime.now() - timedelta(minutes=int(raw_date.split()[0]))
                elif "yesterday" in raw_date.lower():
                    parsed_date = datetime.now() - timedelta(days=1)
                elif "today" in raw_date.lower():
                    parsed_date = datetime.now()
            except (ValueError, IndexError):
                # The leading token is not a number; leave the date unknown
                parsed_date = None

        formatted_date = parsed_date.strftime("%Y-%m-%d") if parsed_date else "UNKNOWN"

        all_reviews.append({
            "Review Text": review_text,
            "Rating": rating,
            "Date": formatted_date
        })

    print(f"Total terkumpul sekarang: {len(all_reviews)}")
    time.sleep(2)  # pause between pages

# SAVE
# Explicit columns keep the frame well-formed even when nothing was scraped
df = pd.DataFrame(all_reviews, columns=["Review Text", "Rating", "Date"])

# NOTE(review): the merge step later in this notebook reads from
# "/content/drive/MyDrive/Skripsi/FD_Scraping_500" — confirm which folder is intended.
folder = "/content/drive/MyDrive/FD_Scraping_500"
os.makedirs(folder, exist_ok=True)

save_path = f"{folder}/{product_name}_500reviews.xlsx"
# to_excel() no longer accepts encoding= (removed in pandas 2.0); .xlsx is UTF-8 internally
df.to_excel(save_path, index=False)

print("\nSELESAI!")
print("File tersimpan:", save_path)
print("Total review:", len(df))

# YEAR DISTRIBUTION
# "UNKNOWN" dates are coerced to NaT and appear as NaN in the counts
df['Year'] = pd.to_datetime(df['Date'], errors='coerce').dt.year
print("\nDistribusi review per tahun:")
print(df['Year'].value_counts(dropna=False))

# Pre-processing

## Gabungkan Data

In [None]:
import pandas as pd
import glob
import os

# CONFIG
input_folder = "/content/drive/MyDrive/Skripsi/FD_Scraping_500"
output_folder = "/content/drive/MyDrive/Skripsi/Preprocessing"
output_file = f"{output_folder}/AllProducts_500reviews_raw.xlsx"

os.makedirs(output_folder, exist_ok=True)

# Letter prefix of the ReviewID for each product (key = file-name stem)
product_code = {
    "Emina": "A",
    "AzarineCosmetic": "B",
    "SkinAquaMilk": "C",
    "Biore": "D",
    "NPURE": "E",
    "SkinAquaGel": "F",
    "LOrealParis": "G",
    "NIVEA": "H",
    "Carasun": "I",
    "Wardah": "J"
}

# LOAD & COMBINE EXCEL
# Sorted so the row order of the combined file is the same on every run
excel_files = sorted(glob.glob(f"{input_folder}/*.xlsx"))
if not excel_files:
    # pd.concat([]) would otherwise fail with an unhelpful "No objects to concatenate"
    raise FileNotFoundError(f"Tidak ada file .xlsx di folder: {input_folder}")

all_data = []

print("File ditemukan untuk digabung:")
for file in excel_files:
    print(" -", file)

    # The product name is the file name without the "_500reviews.xlsx" suffix
    base_name = os.path.basename(file).replace("_500reviews.xlsx", "")

    # Read the per-product workbook
    df = pd.read_excel(file)

    # Tag every row with its product
    df["Product"] = base_name

    # Look up the product letter; "X" is the fallback for unknown products
    code = product_code.get(base_name, "X")
    if base_name not in product_code:
        # Several unknown products would share the "X" prefix and collide
        print(f"   (peringatan) '{base_name}' tidak ada di product_code, memakai kode 'X'")

    # Build ReviewID: A001, A002, ...
    df["ReviewID"] = [f"{code}{str(i+1).zfill(3)}" for i in range(len(df))]

    all_data.append(df)

# Combine everything into one frame
df_all = pd.concat(all_data, ignore_index=True)

print("\nTotal review gabungan:", len(df_all))

# SAVE TO DRIVE
# to_excel() no longer accepts encoding= (removed in pandas 2.0); .xlsx is UTF-8 internally
df_all.to_excel(output_file, index=False)

print("\nSELESAI! DATA TERSIMPAN")
print("Lokasi file:", output_file)

## Case Folding & Cleaning Data

In [None]:
import re

def clean_text(text):
    """Case-fold a raw review and strip URLs, symbols and surplus whitespace.

    Args:
        text: Raw review text; may be missing (None/NaN) or a non-string value.

    Returns:
        The cleaned, lower-cased string, or "" when the input is missing.
    """
    if pd.isna(text):
        return ""
    text = str(text).lower()  # case folding

    # Remove URLs. The dot in "bit.ly" is escaped so that ordinary text such as
    # "habit lyrics" is not mistaken for a short link and deleted.
    text = re.sub(r"http\S+|www\S+|bit\.ly\S+", "", text)
    # Replace emoji and symbols; word characters, whitespace and , . ! ? are kept
    text = re.sub(r"[^\w\s,.!?]", " ", text)
    # Collapse runs of whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Clean every review, then keep only the first occurrence of each cleaned text
df_all["Clean Review"] = df_all["Review Text"].apply(clean_text)
df_all = df_all.drop_duplicates(subset=["Clean Review"], keep="first")

output_folder = "/content/drive/MyDrive/Skripsi/Preprocessing"
output_file = f"{output_folder}/DataClean.xlsx"

# to_excel() no longer accepts encoding= (removed in pandas 2.0); .xlsx is UTF-8 internally
df_all.to_excel(output_file, index=False)

print("\nSELESAI! DATA TERSIMPAN")
print("Lokasi file:", output_file)

## Sentence Splitting

In [None]:
import stanza
import pandas as pd
from tqdm.auto import tqdm

# Register tqdm with pandas; progress_apply is used further down in this cell
tqdm.pandas()

# Download Stanza Indonesian model
stanza.download("id")

# Load Stanza pipeline (tokenize processor only)
nlp = stanza.Pipeline(lang="id", processors="tokenize")

# Load clean dataset written by the cleaning step
file_path = "/content/drive/MyDrive/Skripsi/Preprocessing/DataClean.xlsx"
df = pd.read_excel(file_path)

# Function for sentence splitting
def stanza_sentence_split(text):
    """Split one review into sentences with the module-level Stanza pipeline ``nlp``.

    Args:
        text: Cleaned review text; may be missing (None/NaN), blank, or a
            non-string cell value read back from Excel.

    Returns:
        A list of stripped sentence strings; an empty list for missing or
        blank input.
    """
    if pd.isna(text):
        return []
    # Cells read back from Excel are not guaranteed to be strings
    text = str(text)
    if not text.strip():
        # Nothing to tokenize; skip the pipeline call
        return []
    doc = nlp(text)
    return [sentence.text.strip() for sentence in doc.sentences]

print("Memulai sentence splitting (Stanza).")

# Apply sentence splitting. The cleaning step stores its output in the
# "Clean Review" column, so that is the column read here.
df["Sentences"] = df["Clean Review"].progress_apply(stanza_sentence_split)

# Expand sentences into multiple rows (one row per sentence)
df_sentences = df.explode("Sentences").reset_index(drop=True)

# Rename column
df_sentences = df_sentences.rename(columns={"Sentences": "Sentence"})

# Drop empty entries. explode() turns an empty list into NaN, and NaN is truthy
# under astype(bool), so missing values are removed explicitly first.
df_sentences = df_sentences.dropna(subset=["Sentence"])
df_sentences = df_sentences[df_sentences["Sentence"].str.strip().astype(bool)]
df_sentences = df_sentences.reset_index(drop=True)

# NOTE(review): the aspect-extraction step reads a "SentenceID" column that is
# not created here — confirm where it is added.

# Export
output_path = "/content/drive/MyDrive/Skripsi/Preprocessing/DataSentences.xlsx"
df_sentences.to_excel(output_path, index=False)

print("\nSentence Splitting selesai.")
print("Output file:", output_path)
print("Total kalimat:", len(df_sentences))
print("Jumlah kalimat per produk:")
print(df_sentences["Product"].value_counts())

## Normalisasi

In [None]:
import pandas as pd
from tqdm import tqdm
from indo_normalizer import Normalizer

# Load the sentence-level data written by the sentence-splitting step
input_path = "/content/drive/MyDrive/Skripsi/Preprocessing/DataSentences.xlsx"
df = pd.read_excel(input_path)

# Normalizer instance used by normalize_text() below
normalizer = Normalizer()
# Register tqdm with pandas so progress_apply shows a labelled progress bar
tqdm.pandas(desc="Normalizing text")

def normalize_text(text):
    """Normalize one sentence with the module-level ``normalizer``.

    Args:
        text: Sentence text; may be missing (None/NaN).

    Returns:
        The normalizer's output for the lower-cased text, "" for missing
        input, or the lower-cased text unchanged when normalization fails.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    try:
        return normalizer.normalize_text(lowered)
    except Exception:
        # Best-effort fallback. A bare "except:" would also swallow
        # KeyboardInterrupt, making the long progress_apply run impossible to stop.
        return lowered

# Normalize every sentence (with a progress bar) into a new column
df["Normalized_Sentence"] = df["Sentence"].progress_apply(normalize_text)

# Save output to Drive
output_path = "/content/drive/MyDrive/Skripsi/Preprocessing/DataNormalized.xlsx"
df.to_excel(output_path, index=False)

print("Normalisasi selesai.")
print("Output file:", output_path)

# Ekstraksi Aspek

In [None]:
import stanza

# Download the Indonesian and English Stanza models
stanza.download("id")
stanza.download("en")
# One pipeline per language; the loop below picks one per sentence.
# NOTE(review): the visible code only reads upos and lemma — confirm depparse is needed.
nlp_id = stanza.Pipeline("id", processors="tokenize,pos,lemma,depparse")
nlp_en = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")

def extract_single_word_nouns(doc):
    """Collect the lower-cased lemma of every NOUN word in a parsed document.

    Args:
        doc: Parsed document exposing ``sentences``, each with ``words`` that
            carry ``upos``, ``lemma`` and ``text``.

    Returns:
        A list of lower-cased aspect candidates in document order (duplicates
        are kept).
    """
    aspects = []
    for sent in doc.sentences:
        for w in sent.words:
            if w.upos != "NOUN":
                continue
            # Prefer the lemma so variants collapse to one form; fall back to the
            # surface text when the lemma is missing instead of crashing on None.
            base = w.lemma if w.lemma else w.text
            if base:
                aspects.append(base.lower())
    return aspects

import os  # used for the output-folder and mapping-file checks below

# Load data
input_path = "/content/drive/MyDrive/Skripsi/Preprocessing/DataNormalized.xlsx"
df = pd.read_excel(input_path)

# Fail fast on missing columns instead of part-way through the parsing loop
required_cols = ["SentenceID", "Product", "Normalized_Sentence"]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"Kolom tidak ditemukan di {input_path}: {missing_cols}")

aspect_rows = []

# NOTE(review): detect_lang_simple is not defined in this cell; it must already
# exist in the kernel — confirm where it comes from.
for _, row in tqdm(df.iterrows(), total=len(df)):
    # Skip empty sentences; str(NaN) would otherwise be parsed as the word "nan"
    if pd.isna(row["Normalized_Sentence"]):
        continue
    text = str(row["Normalized_Sentence"])

    # Pick the pipeline that matches the detected language
    lang = detect_lang_simple(text)
    doc = nlp_id(text) if lang == "id" else nlp_en(text)

    aspects = extract_single_word_nouns(doc)

    # One output row per (sentence, aspect) pair
    for asp in aspects:
        aspect_rows.append({
            "SentenceID": row["SentenceID"],
            "Product": row["Product"],
            "Text": text,
            "Aspect": asp
        })

# Explicit columns keep the frame well-formed even when no aspect was found
df_aspect = pd.DataFrame(aspect_rows, columns=["SentenceID", "Product", "Text", "Aspect"])

# Aspect filtering
# Drop aspects that occur fewer than 10 times
min_freq = 10
freq = df_aspect["Aspect"].value_counts()
valid_aspects = freq[freq >= min_freq].index.tolist()
df_filtered = df_aspect[df_aspect["Aspect"].isin(valid_aspects)]

# Manual filtering
aspects_to_remove = [
    "sih", "pokok", "banget", "gitu", "aqua", "soal", "emina", "wardah",
    "pas", "azarine", "npure", "cinta", "benar", "enggak", "kayak", "kata",
    "sayang", "carasun", "loreal", "nih", "gue", "nivea", "plus", "bagi", "aktifitas",
    "bawah", "ada", "biore", "guys", "segi", "macam", "nya", "terus", "sumpah", "enak",
    "saat", "kalo", "dan", "so", "mana", "nama", "gini", "poll", "hal", "gara", "giat",
    "guy", "ini", "love", "kemana", "tabur", "kurang", "huhu", "pol", "iseng", "thanks",
    "buat", "sebul", "oke", "pasti", "dll", "hehe", "mantap", "suka", "bikin", "tea",
    "masa", "far", "wkwk", "tuh", "menurutku", "lebih", "poin", "deh", "milik", "rada",
    "pun", "tadi", "drama", "secinta", "female", "dong", "eh", "juga", "overall", "but",
    "course", "saran", "apa", "inti", "agak", "atas", "bintang", "umumnya", "cukup", "haha",
    "sunscreen", "sinar", "hasil", "matahari", "bedak", "skincare", "make", "teman", "lindung",
    "jenis", "up", "moisturizer", "aktivitas", "base", "orang", "masalah", "outdoor", "review",
    "ruang", "papar", "sekarang", "remaja", "area", "guna", "sun", "bekas", "serum", "rumah",
    "sma", "kilang", "minus", "pelajar", "primer", "awan", "anak", "no", "pemula", "siang",
    "tempat", "moisture", "menit", "foundation", "kuliah", "semi", "smp", "akhir", "hati",
    "kondisi", "lama", "layer", "sekolah", "olahraga", "luar", "indoor", "pagi", "cuaca",
    "seharian", "pengalaman", "kalangan", "ibu", "traveling", "kesan", "tabir", "surya",
    "tas", "masker", "kuning", "mahasiswa", "wudhu", "butuh", "daerah", "preparation",
    "panas", "auto", "cuci", "cushion", "pouch", "gampang", "cocok", "toner", "juara",
    "cakey", "crack", "tua", "hype", "event", "jalan", "manfaat", "rawat", "beauty", "pantai",
    "sunblock", "untung", "rating", "lapang", "pink", "step", "complexion", "tambah", "nilai",
    "ekstra", "rangkai", "loose", "malam", "sekali", "merah", "sisa", "mama", "cowok", "skip",
    "sehat", "terik", "list", "adek", "kakak", "lapis", "kena", "waktu", "mini", "travelling",
    "white", "skin", "air", "daya", "hydrasoothe", "beat", "dna", "shield"
]

# .copy() so the column assignment further down works on an independent frame
df = df_filtered[~df_filtered["Aspect"].isin(aspects_to_remove)].copy()

# Save the data with aspects
output_aspect = "/content/drive/MyDrive/Skripsi/Aspect_Extraction/Data_Aspek.xlsx"
os.makedirs(os.path.dirname(output_aspect), exist_ok=True)
df.to_excel(output_aspect, index=False)

# Aspect frequencies. rename_axis/reset_index(name=...) yields the columns
# "Aspect" and "Frequency" regardless of the pandas version.
aspect_freq = (
    df["Aspect"]
    .value_counts()
    .rename_axis("Aspect")
    .reset_index(name="Frequency")
)

# Frequency workbook used for grouping aspects into features and sub-features.
# The grouping columns are not produced by this notebook (presumably filled in
# by hand), so a workbook that already carries them is never overwritten.
output_freq = "/content/drive/MyDrive/Skripsi/Aspect_Extraction/Frekuensi_Aspek.xlsx"
mapping_cols = ["Jenis_Fitur", "Sub_Fitur"]

df_map = pd.read_excel(output_freq) if os.path.exists(output_freq) else None

if df_map is None or not set(mapping_cols).issubset(df_map.columns):
    aspect_freq.to_excel(output_freq, index=False)
    raise ValueError(
        f"Kolom {mapping_cols} belum ada di {output_freq}. "
        "Lengkapi pengelompokan fitur/sub-fitur di file tersebut, lalu jalankan ulang sel ini."
    )

# Mapping of the grouping result
# Workbooks written by older runs keep the aspect names under "Frequency"
if "Aspect" not in df_map.columns and "Frequency" in df_map.columns:
    df_map = df_map.rename(columns={"Frequency": "Aspect"})

# Make sure the join key is lower-cased and stripped on both sides
df["Aspect"] = df["Aspect"].str.lower().str.strip()
df_map["Aspect"] = df_map["Aspect"].str.lower().str.strip()

# Merge on "Aspect"; aspects without a mapping get NaN feature columns
df_joined = df.merge(
    df_map[["Aspect", "Jenis_Fitur", "Sub_Fitur"]],
    on="Aspect",
    how="left"
)

# Save
df_joined.to_excel("/content/drive/MyDrive/Skripsi/Aspect_Extraction/Aspek_Final.xlsx", index=False)

# Klasifikasi Sentimen

## IndoBERT

In [None]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

# Load data
df = pd.read_excel("/content/drive/MyDrive/Skripsi/Aspect_Extraction/Aspek_Final.xlsx")

# Load model
model_dir = "/content/drive/MyDrive/Skripsi/Sentimen/Model_IndoBERT_ABSA_Finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.eval()

results = []

# Score every (sentence, aspect) pair, one row at a time
for _, row in tqdm(df.iterrows(), total=len(df)):
    text_input = f"{row['Text']} [SEP] {row['Aspect']}"

    inputs = tokenizer(
        text_input,
        return_tensors="pt",
        truncation=True,
        max_length=128
    )

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits.squeeze().numpy()
    # Softmax with the maximum subtracted first: same probabilities, but
    # np.exp cannot overflow on large logits
    exp_logits = np.exp(logits - logits.max())
    softmax = exp_logits / exp_logits.sum()

    results.append({
        "SentenceID": row["SentenceID"],
        "Product": row["Product"],
        "Text": row["Text"],
        "Aspect": row["Aspect"],
        "Jenis_Fitur": row["Jenis_Fitur"],
        "Sub_Fitur": row["Sub_Fitur"],
        "Logits_Negatif": logits[0],
        "Logits_Netral": logits[1],
        "Logits_Positif": logits[2],
        "Prob_Negatif": softmax[0],
        "Prob_Netral": softmax[1],
        "Prob_Positif": softmax[2],
    })

df_sentimen = pd.DataFrame(results)

# Sentiment labelling
# NOTE(review): assumes the model's class ids are 0=Negatif, 1=Netral,
# 2=Positif — confirm against the fine-tuned model's label mapping.
sentiment_map = {
    0: "Negatif",
    1: "Netral",
    2: "Positif"
}

# The probability columns live in df_sentimen; the input frame df does not have them
df_sentimen["Sentimen"] = (
    df_sentimen[["Prob_Negatif", "Prob_Netral", "Prob_Positif"]]
    .values
    .argmax(axis=1)
)

df_sentimen["Sentimen"] = df_sentimen["Sentimen"].map(sentiment_map)

# Reorder the columns
df_fixed = df_sentimen[[
    "SentenceID",
    "Product",
    "Text",
    "Aspect",
    "Jenis_Fitur",
    "Sub_Fitur",
    "Sentimen",
    "Prob_Negatif",
    "Prob_Netral",
    "Prob_Positif",
    "Logits_Negatif",
    "Logits_Netral",
    "Logits_Positif"
]]

df_fixed.to_excel("/content/drive/MyDrive/Skripsi/Sentimen/Data_FINAL.xlsx", index=False)

## Pivot Berdasarkan Distribusi Sentimen

In [None]:
def _sentiment_pivot(group_keys):
    """Count df_fixed rows per key combination and sentiment label, plus a row total."""
    counts = (
        df_fixed
        .groupby([*group_keys, "Sentimen"])
        .size()
        .unstack(fill_value=0)
    )
    counts["Total"] = counts.sum(axis=1)
    return counts


# One table per grouping level
pivot_jenis = _sentiment_pivot(["Jenis_Fitur"])                                # feature type
pivot_sub = _sentiment_pivot(["Sub_Fitur"])                                    # sub-feature
pivot_produk = _sentiment_pivot(["Product"])                                   # product
pivot_produk_jenis = _sentiment_pivot(["Product", "Jenis_Fitur"])              # product x feature type
pivot_produk_sub = _sentiment_pivot(["Product", "Sub_Fitur"])                  # product x sub-feature
pivot_jenis_sub = _sentiment_pivot(["Jenis_Fitur", "Sub_Fitur"])               # feature type x sub-feature
pivot_lengkap = _sentiment_pivot(["Product", "Jenis_Fitur", "Sub_Fitur"])      # full breakdown

# Save to Excel, one sheet per table (sheet order follows the dict)
pivot_sheets = {
    "Jenis_Fitur": pivot_jenis,
    "Sub_Fitur": pivot_sub,
    "Produk": pivot_produk,
    "ProdukxJenis": pivot_produk_jenis,
    "ProdukxSub": pivot_produk_sub,
    "JenisxSub": pivot_jenis_sub,
    "Lengkap": pivot_lengkap,
}

with pd.ExcelWriter("/content/drive/MyDrive/Skripsi/Pivot_Sentimen.xlsx") as writer:
    for sheet_name, table in pivot_sheets.items():
        table.to_excel(writer, sheet_name=sheet_name)

## Pivot Berdasarkan Rata-rata Skor Probabilitas Sentimen Positif

In [None]:
def _mean_positive(group_keys, sort_keys, ascending):
    """Average Prob_Positif of df_fixed per group, returned as a sorted flat frame."""
    means = df_fixed.groupby(group_keys)["Prob_Positif"].mean()
    return means.reset_index().sort_values(sort_keys, ascending=ascending)


# Product-level tables are ordered by product, then by descending mean
_by_product = (["Product", "Prob_Positif"], [True, False])

# Feature type
avg_pos_jenis = _mean_positive("Jenis_Fitur", "Prob_Positif", False)

# Sub-feature
avg_pos_sub = _mean_positive("Sub_Fitur", "Prob_Positif", False)

# Product x feature type
avg_pos_jenis_produk = _mean_positive(["Product", "Jenis_Fitur"], *_by_product)

# Product x sub-feature
avg_pos_sub_produk = _mean_positive(["Product", "Sub_Fitur"], *_by_product)

# Full breakdown
avg_lengkap = _mean_positive(["Product", "Jenis_Fitur", "Sub_Fitur"], *_by_product)

# One sheet per table (sheet order follows the dict)
avg_sheets = {
    "Jenis_Fitur": avg_pos_jenis,
    "Sub_Fitur": avg_pos_sub,
    "Jenis×Produk": avg_pos_jenis_produk,
    "Sub×Produk": avg_pos_sub_produk,
    "Lengkap": avg_lengkap,
}

with pd.ExcelWriter("/content/drive/MyDrive/Skripsi/RataRata_Sentimen_Positif.xlsx") as writer:
    for sheet_name, table in avg_sheets.items():
        table.to_excel(writer, sheet_name=sheet_name, index=False)


# Metode Balas

In [None]:
import heapq
import math

class BalasBestFirst:
    """Best-first branch-and-bound (Balas-style) solver for a 0/1 selection problem.

    Original problem, in variables x_i in {0, 1}:
        maximise   Z  = sum(P[i] * x_i)
        subject to sum(x_i) == k  and  sum(p[i] * x_i) <= B

    The search runs on the complemented variables y_i = 1 - x_i, which turns
    the problem into a minimisation:
        minimise   Z' = sum(P[i] * y_i)
        subject to sum(y_i) == n - k  and  sum(p[i] * y_i) >= sum(p) - B

    Partial assignments are dicts ``{variable index (1-based): 0 or 1}``.

    Attributes set by the constructor:
        P, p, k, B      -- the inputs, stored as given.
        n               -- number of variables, ``len(P)``.
        target_y        -- required number of y_i equal to 1 (``n - k``).
        target_budget   -- required minimum of sum(p[i] * y_i) (``sum(p) - B``).
        idx_sorted_P    -- 0-based variable indices sorted by ascending P.
        order           -- the same ordering as 1-based indices; used both for
                           branching and for the lower bound.

    Attributes (re)set by every ``solve()`` call:
        best_value      -- best Z' found, ``math.inf`` if none.
        best_solution_y -- dict of the best complete y assignment, or None.
        node_id         -- id of the last processed node (root is 0).
        counter         -- insertion counter used as heap tie-breaker.
        nodes           -- list of dicts logging every processed node.
    """

    def __init__(self, P, p, k, B):
        """Store the problem data and derive the transformed-problem targets.

        Args:
            P: objective coefficient of each variable.
            p: budget coefficient of each variable, aligned with ``P``.
            k: number of variables that must have x_i == 1.
            B: upper limit for sum(p[i] * x_i).

        Raises:
            ValueError: if ``P`` and ``p`` do not have the same length.
        """
        # A longer p would silently distort target_budget and a shorter one
        # would only fail later with an IndexError, so reject it up front.
        if len(P) != len(p):
            raise ValueError(
                f"P and p must have the same length (got {len(P)} and {len(p)})"
            )

        self.P = P
        self.p = p
        self.k = k
        self.B = B
        self.n = len(P)
        self.target_y = self.n - k
        self.target_budget = sum(p) - B

        # Variables sorted by ascending objective coefficient (minimisation).
        self.idx_sorted_P = sorted(range(self.n), key=lambda i: P[i])
        # Same ordering expressed as 1-based y indices.
        self.order = [i + 1 for i in self.idx_sorted_P]

        self._reset_search_state()

    def _reset_search_state(self):
        """Clear the incumbent, the node log and the counters before a search."""
        self.best_value = math.inf      # incumbent value of Z' (minimised)
        self.best_solution_y = None     # incumbent assignment in y variables
        self.node_id = -1               # starts at -1 so the root becomes node 0
        self.counter = 0                # heap tie-breaker
        self.nodes = []

    @staticmethod
    def _count_ones(fixed_y):
        """Return how many variables in ``fixed_y`` are fixed to 1."""
        return sum(1 for val in fixed_y.values() if val == 1)

    def _fixed_budget_sum(self, fixed_y):
        """Return sum(p[i] * y_i) over the variables fixed to 1 in ``fixed_y``."""
        return sum(self.p[i - 1] * val for i, val in fixed_y.items() if val == 1)

    # =============================
    # DIRECT VIOLATION CHECK ON THE FIXED PART
    # =============================
    def check_fixed_violation(self, fixed_y):
        """Tell whether the fixed part alone already rules out feasibility.

        Returns True when either
          * more than ``target_y`` variables are already fixed to 1, or
          * the budget sum of the fixed ones plus the budget coefficients of
            *all* free variables is still below ``target_budget``.
        Returns False otherwise.
        """
        # 1. sum(y_i) <= target_y: too many ones can never be undone.
        if self._count_ones(fixed_y) > self.target_y:
            return True

        # 2. sum(p_i * y_i) >= target_budget: even setting every free
        #    variable to 1 must be able to reach the target.
        free_budget_sum = sum(
            self.p[i - 1] for i in range(1, self.n + 1) if i not in fixed_y
        )
        if (self._fixed_budget_sum(fixed_y) + free_budget_sum) < self.target_budget:
            return True

        return False

    # =============================
    # COMPLETION CHECK (ALL CONSTRAINTS, OPTIMISTIC SCENARIO)
    # =============================
    def is_possible_completion(self, fixed_y):
        """Tell whether ``fixed_y`` can still be extended to a feasible solution.

        Checks that sum(y_i) == target_y is still reachable, and that picking
        the free variables with the largest budget coefficients for the
        remaining ones reaches ``target_budget``.
        """
        ones_fixed = self._count_ones(fixed_y)
        free_count = self.n - len(fixed_y)

        # a) sum(y_i) >= target_y: all free variables set to 1 must be enough.
        if ones_fixed + free_count < self.target_y:
            return False

        # b) sum(y_i) <= target_y: all free variables set to 0 must not exceed.
        if ones_fixed > self.target_y:
            return False

        # Budget: exactly (target_y - ones_fixed) free variables become 1, so
        # the best case takes those with the largest p_i.
        ones_needed = self.target_y - ones_fixed
        free_p_desc = sorted(
            (self.p[i - 1] for i in range(1, self.n + 1) if i not in fixed_y),
            reverse=True,
        )
        best_extra_budget = sum(free_p_desc[:max(0, ones_needed)])

        if (self._fixed_budget_sum(fixed_y) + best_extra_budget) < self.target_budget:
            return False

        return True

    # =============================
    # LOWER BOUND (for minimising Z')
    # =============================
    def lower_bound(self, fixed_y):
        """Return a lower bound on Z' for completions of ``fixed_y``.

        The bound is the objective of the variables fixed to 1 plus the
        smallest objective coefficients among the free variables, taking as
        many as are still needed to reach ``target_y`` ones. The budget
        constraint is ignored here.
        """
        z_bound = sum(self.P[i - 1] * val for i, val in fixed_y.items() if val == 1)

        remaining_ones = self.target_y - sum(fixed_y.values())

        # self.order is ascending in P, so the first free variables are the
        # cheapest ones to switch on.
        for y_idx in self.order:
            if remaining_ones <= 0:
                break
            if y_idx not in fixed_y:
                z_bound += self.P[y_idx - 1]
                remaining_ones -= 1

        return z_bound

    # =============================
    # BALAS ALGORITHM (BEST-FIRST)
    # =============================
    def solve(self):
        """Run the best-first search and print the result.

        Every call starts from a clean state, so calling ``solve()`` again on
        the same instance reproduces the same node log and incumbent instead
        of pruning against the previous run. Results are left in
        ``best_value``, ``best_solution_y`` and ``nodes``; returns None.
        """
        self._reset_search_state()

        pq = []  # min-heap keyed on (lower bound, insertion counter)

        # Heap entries: (bound, counter, fixed_y, parent_node_id, decision).
        # The root is the only entry when popped, so its key of 0 is never
        # compared against another bound.
        self.counter += 1
        heapq.heappush(pq, (0, self.counter, {}, None, "root"))

        while pq:
            lb, _, fixed_y, parent_node_id, decision_str = heapq.heappop(pq)
            self.node_id += 1
            current_node_id = self.node_id

            status = "EXPANDED"
            reason = ""

            # Prune against the incumbent first, then on feasibility.
            if lb >= self.best_value:
                status, reason = "PRUNED", "BOUND"
            elif self.check_fixed_violation(fixed_y):
                status, reason = "PRUNED", "INFEASIBLE PARTIAL (VIOLATION)"
            elif not self.is_possible_completion(fixed_y):
                status, reason = "PRUNED", "NOT POSSIBLE TO COMPLETE"

            self.nodes.append({
                "Node": current_node_id,
                "Parent": parent_node_id,
                "Decision": decision_str,
                "Fixed_Y": fixed_y.copy(),
                "LB": round(lb, 6),
                "Status": status,
                "Reason": reason
            })

            if status != "EXPANDED":
                continue

            # Complete assignment: re-check feasibility and update incumbent.
            if len(fixed_y) == self.n:
                total_y_ones = sum(fixed_y.values())
                total_py_sum = self._fixed_budget_sum(fixed_y)

                if total_y_ones == self.target_y and total_py_sum >= self.target_budget:
                    if lb < self.best_value:
                        self.best_value = lb
                        self.best_solution_y = fixed_y.copy()
                continue

            # Branch on the first still-free variable in self.order.
            next_y_var_idx = next(
                (y_idx for y_idx in self.order if y_idx not in fixed_y), -1
            )
            if next_y_var_idx == -1:  # not expected while len(fixed_y) != n
                continue

            # Two children: y = 1 first, then y = 0.
            for val in [1, 0]:
                new_fixed_y = fixed_y.copy()
                new_fixed_y[next_y_var_idx] = val
                new_lb = self.lower_bound(new_fixed_y)

                self.counter += 1
                heapq.heappush(
                    pq,
                    (new_lb, self.counter, new_fixed_y, current_node_id, f"y{next_y_var_idx}={val}")
                )

        self._print_result()

    def _print_result(self):
        """Print the search summary and the incumbent mapped back to x = 1 - y."""
        print("\n=== FINAL RESULT (BEST-FIRST) ===")
        print("Total nodes:", self.node_id + 1)
        print("Minimum (Z'):", round(self.best_value, 6))
        print("Optimal y:", self.best_solution_y)

        # Compare with None: an empty dict is a valid solution when n == 0.
        if self.best_solution_y is not None:
            optimal_x = [0] * self.n
            selected_x_indices_1based = []
            total_original_P = 0.0
            total_original_p = 0.0

            for y_idx_1based, y_val in self.best_solution_y.items():
                original_P_idx_0based = y_idx_1based - 1

                # x_i = 1 - y_i, stored at the variable's original position.
                x_val = 1 - y_val
                optimal_x[original_P_idx_0based] = x_val

                if x_val == 1:
                    selected_x_indices_1based.append(original_P_idx_0based + 1)
                    total_original_P += self.P[original_P_idx_0based]
                    total_original_p += self.p[original_P_idx_0based]

            print("Optimal x:", optimal_x)
            print("Variabel yang terpilih (Original Index):", selected_x_indices_1based)
            print("Maksimum Fungsi Objektif (Z)):", round(total_original_P, 6))
            print("Total anggaran:", round(total_original_p, 1))
            print(f"\nVerifikasi: ∑x_i = {sum(optimal_x)} (harus {self.k})")
            print(f"Verifikasi: ∑p_i*x_i = {total_original_p:.1f} (harus ≤ {self.B})")
        else:
            print("Tidak ditemukan solusi feasible.")

    # =============================
    # PRINT NODES (debugging / visualisation)
    # =============================
    def print_nodes(self):
        """Print the node log of the last ``solve()`` call as a text table."""
        print("""\nNode | Parent | Decision   | Fixed_Y                                 | LB       | Status\n------------------------------------------------------------------------------------------""")
        for n_data in self.nodes:
            fx = ", ".join([f"y{i}={v}" for i, v in n_data["Fixed_Y"].items()])
            print(f"{n_data['Node']:>4} | {str(n_data['Parent']):>6} | "
                  f"{n_data['Decision']:<10} | {fx:<40} | "
                  f"{n_data['LB']:<8} | {n_data['Status']} {n_data['Reason']}")

In [None]:
# Problem instance: choose exactly k variables with sum(p_i * x_i) <= B.
# P holds the objective coefficients, p the matching budget coefficients.
P = [
    0.56011612024394,
    0.532157896426633,
    0.532157896426633,
    0.560130487332721,
    0.319457699663552,
    0.233660032327584,
    0.2408816335929,
    0.475695812636056,
    0.228420627459742,
    0.410232143258209,
]
p = [26, 65, 48.5, 140, 119, 47.7, 99, 60, 69, 37.5]
k, B = 3, 150

# Solve, then print the branch-and-bound node table.
model = BalasBestFirst(P=P, p=p, k=k, B=B)
model.solve()
model.print_nodes()


=== FINAL RESULT (BEST-FIRST) ===
Total nodes: 21
Minimum Z (Transformed Objective): 2.468478
Optimal y: {9: 1, 6: 1, 7: 1, 5: 1, 10: 1, 8: 1, 2: 0, 3: 0, 1: 0, 4: 1}
Optimal x (0-based): [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
Selected Items (Original Index, 1-based): [2, 3, 1]
Maximum Objective Value (Original P): 1.624432
Total Cost (Original p): 139.5

Verifikasi: ∑x_i = 3 (harus 3)
Verifikasi: ∑p_i*x_i = 139.5 (harus ≤ 150)

Node | Parent | Decision   | Fixed_Y                                 | LB       | Status
------------------------------------------------------------------------------------------
   0 |   None | root       |                                          | 0        | EXPANDED 
   1 |      0 | y9=1       | y9=1                                     | 2.440506 | EXPANDED 
   2 |      1 | y6=1       | y9=1, y6=1                               | 2.440506 | EXPANDED 
   3 |      2 | y7=1       | y9=1, y6=1, y7=1                         | 2.440506 | EXPANDED 
   4 |      3 | y5=1  

In [None]:
# Same instance as the B = 150 run, with the budget limit raised to B = 300.
# P holds the objective coefficients, p the matching budget coefficients.
P = [
    0.56011612024394,
    0.532157896426633,
    0.532157896426633,
    0.560130487332721,
    0.319457699663552,
    0.233660032327584,
    0.2408816335929,
    0.475695812636056,
    0.228420627459742,
    0.410232143258209,
]
p = [26, 65, 48.5, 140, 119, 47.7, 99, 60, 69, 37.5]
k, B = 3, 300

# Solve, then print the branch-and-bound node table.
model = BalasBestFirst(P=P, p=p, k=k, B=B)
model.solve()
model.print_nodes()


=== FINAL RESULT (BEST-FIRST) ===
Total nodes: 27
Minimum Z (Transformed Objective): 2.440506
Optimal y: {9: 1, 6: 1, 7: 1, 5: 1, 10: 1, 8: 1, 2: 1, 3: 0, 1: 0, 4: 0}
Optimal x (0-based): [1, 0, 1, 1, 0, 0, 0, 0, 0, 0]
Selected Items (Original Index, 1-based): [3, 1, 4]
Maximum Objective Value (Original P): 1.652405
Total Cost (Original p): 214.5

Verifikasi: ∑x_i = 3 (harus 3)
Verifikasi: ∑p_i*x_i = 214.5 (harus ≤ 300)

Node | Parent | Decision   | Fixed_Y                                 | LB       | Status
------------------------------------------------------------------------------------------
   0 |   None | root       |                                          | 0        | EXPANDED 
   1 |      0 | y9=1       | y9=1                                     | 2.440506 | EXPANDED 
   2 |      1 | y6=1       | y9=1, y6=1                               | 2.440506 | EXPANDED 
   3 |      2 | y7=1       | y9=1, y6=1, y7=1                         | 2.440506 | EXPANDED 
   4 |      3 | y5=1  