In [4]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import re

# Tentukan path folder
source_folder = 'hasil scraping'
output_folder = 'cleaning_data'

# Cek dan buat folder sumber jika belum ada
if not os.path.exists(source_folder):
    print(f"‚ùå Folder '{source_folder}' tidak ditemukan!")
    print("Pastikan folder 'hasil scraping' dengan dataset sudah tersedia.")
    exit()
else:
    print(f"‚úì Folder sumber ditemukan: {source_folder}")

# Cek dan buat folder output secara otomatis
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
    print(f"‚úì Folder '{output_folder}' berhasil dibuat")
else:
    print(f"‚úì Folder '{output_folder}' sudah ada")

print(f"‚úì Membaca dataset dari folder: {source_folder}")
print(f"‚úì Hasil cleaning akan disimpan ke folder: {output_folder}")

# Dictionary untuk menyimpan semua dataset
datasets = {}
# Dictionary untuk tracking proses cleaning
cleaning_summary = {}

print("\n" + "="*60)
print("LOADING DATASETS DARI FOLDER HASIL SCRAPING")
print("="*60)

# Mapping file yang akan dibaca - DIPISAHKAN TIRTA NIRWANA DAN SONGGORITI
file_mappings = {
    'coban_rais': [
        'air_terjun_coban_rais_ALL_reviews_20250802_102141.csv',
        'coban_rais_reviews_20250804_223751.csv'
    ],
    'alun_alun': ['alun_alun_kota_wisata_batu_reviews_20250725_105504.csv'],
    'batu_economis_park': ['batu_economis_park_ALL_reviews_20250802_172842.csv'],
    'batu_love_garden': ['batu_love_garden_baloga_ALL_reviews_20250802_181709.csv'],
    'batu_night_spectacular': ['batu_night_spectacular_ALL_reviews_20250724_154759.csv'],
    'batu_rafting': ['batu_rafting_reviews_20250810_224610.csv'],
    'coban_putri': ['coban_putri_reviews_all_20250803_001704.csv'],
    'coban_talun': ['coban_talun_ALL_reviews_20250724_114716.csv'],
    'desa_wisata_punten': ['desa_wisata_punten_reviews_20250810_230050.csv'],
    'desa_wisata_tulungrejo': ['desa_wisata_tulungrejo_reviews_20250802_013319.csv'],
    'eco_active_park': ['eco_active_park_ALL_reviews_20250725_002543.csv'],
    'gussari_goa_pinus_batu': ['gussari_goa_pinus_batu_ALL_reviews_20250811_021707.csv'],
    'gunung_arjuno': ['gunung_arjuno_reviews_20250804_180540.csv'],
    'gunung_panderman': ['gunung_panderman_reviews_20250724_141423.csv'],
    'jatim_park_1': ['jatim_park_1_ALL_reviews_20250723_140636.csv'],
    'jatim_park_2': ['jatim_park_2_ALL_reviews_20250725_114708.csv'],
    'jatim_park_3': ['jatim_park_3_ALL_reviews_20250725_011845.csv'],
    'lumbung_stroberi': ['lumbung_stroberi_ALL_reviews_20250804_160259.csv'],
    'milenial_glow_garden': ['milenial_glow_garden_ALL_reviews_20250802_201820.csv'],
    'museum_angkut': ['museum_angkut_ALL_reviews_20250725_160008.csv'],
    'paralayang_gunung_banyak': ['paralayang_gunung_banyak_reviews_20250802_235154.csv'],
    'pemandian_air_panas_cangar': ['pemandian_air_panas_cangar_ALL_reviews_20250802_230623.csv'],
    'songgoriti_hot_springs': ['songgoriti_hot_springs_all_reviews_20250804_192319.csv'],
    'taman_dolan': ['taman_dolan_ALL_reviews_20250804_174157.csv'],
    'taman_pinus_campervan': ['taman_pinus_campervan_ALL_reviews_20250804_160757.csv'],
    'taman_selecta': ['taman_selecta_ALL_reviews_20250724_085149.csv'],
    'tirta_nirwana_hotspring': [
        'tirta_nirwana_songgoriti_all_reviews_20250803_011926.csv',
        'tirta_nirwana_songgoriti_all_reviews_newest_20250804_213237.csv'
    ],
    'wisata_bunga_sidomulyo': [
        'wisata_bunga_sidomulyo_reviews_20250802_174108.csv',
        'wisata_bunga_sidomulyo_reviews_20250803_012801.csv',
        'rest_area_desa_wisata_sidomulyo_reviews_20250802_174921.csv'
    ],
    'wisata_desa_agro_bumiaji': ['wisata_desa_agro_bumiaji_reviews_20250804_150849.csv'],
    'wisata_petik_apel_mandiri': ['wisata_petik_apel_mandiri_reviews_20250804_181035.csv']
}

# Load dan gabungkan dataset
for key, filenames in file_mappings.items():
    combined_data = []
    files_loaded = 0
    files_info = []
    
    for filename in filenames:
        file_path = os.path.join(source_folder, filename)
        try:
            df = pd.read_csv(file_path)
            combined_data.append(df)
            files_loaded += 1
            files_info.append(f"{filename} ({len(df)} rows)")
            print(f"‚úì {filename} berhasil dimuat ({len(df)} rows)")
        except FileNotFoundError:
            files_info.append(f"{filename} (FILE NOT FOUND)")
            print(f"‚ùå File tidak ditemukan: {filename}")
        except Exception as e:
            files_info.append(f"{filename} (ERROR: {str(e)})")
            print(f"‚ùå Error loading {filename}: {e}")
    
    # Gabungkan data jika ada file yang berhasil dimuat
    if combined_data:
        if len(combined_data) == 1:
            datasets[key] = combined_data[0]
            cleaning_summary[key] = {
                'files': files_info,
                'files_loaded': files_loaded,
                'original_rows': len(combined_data[0]),
                'status': 'SUCCESS'
            }
        else:
            # Gabungkan multiple files untuk wisata yang sama
            combined_df = pd.concat(combined_data, ignore_index=True)
            original_combined = len(combined_df)
            # Hapus duplikat setelah penggabungan
            combined_df = combined_df.drop_duplicates()
            datasets[key] = combined_df
            cleaning_summary[key] = {
                'files': files_info,
                'files_loaded': files_loaded,
                'original_rows': original_combined,
                'after_dedup_rows': len(combined_df),
                'duplicates_removed': original_combined - len(combined_df),
                'status': 'SUCCESS (MERGED)'
            }
            print(f"  ‚Üí Digabung menjadi {len(combined_df)} rows (duplikat dihapus: {original_combined - len(combined_df)})")
    else:
        cleaning_summary[key] = {
            'files': files_info,
            'files_loaded': 0,
            'original_rows': 0,
            'status': 'FAILED'
        }

if len(datasets) == 0:
    print("‚ùå Tidak ada dataset yang berhasil dimuat!")
    exit()

print(f"\n‚úì Total dataset berhasil dimuat: {len(datasets)}")

# Fungsi untuk membersihkan teks dari Google Translate dan noise lainnya
def clean_review_text(text):
    """
    Membersihkan teks review dari Google Translate markers dan noise lainnya
    TANPA menghapus review pendek - semua review dipertahankan
    """
    if pd.isna(text) or text == '':
        return ''
    
    text = str(text)
    
    # Pola-pola yang akan dihapus
    patterns_to_remove = [
        # Google Translate markers
        r'Diterjemahkan oleh Google\s*„Éª?\s*',
        r'Lihat versi asli\s*\([^)]*\)\s*',
        r'Diterjemahkan oleh Google\s*',
        r'Translated by Google\s*„Éª?\s*',
        r'See original\s*\([^)]*\)\s*',
        r'Translated by Google\s*',
        
        # Other translation markers
        r'Terjemahan Google\s*',
        r'Google Translation\s*',
        r'Auto-translated\s*',
        r'Machine translated\s*',
        
        # Language indicators yang tidak jelas
        r'\(Tionghoa\)\s*',
        r'\(Chinese\)\s*',
        r'\(English\)\s*',
        r'\(Bahasa Indonesia\)\s*',
        r'\(Indonesian\)\s*',
        r'\([A-Za-z\s]+\)\s*$',  # Hapus tanda kurung bahasa di akhir
        
        # Marker lainnya
        r'„Éª\s*',
        r'‚Ä¶\s*$',  # Ellipsis di akhir
        r'\s*\.\.\.\s*$',  # Triple dots di akhir
        
        # Multiple spaces dan newlines
        r'\s+',
        r'\n+',
    ]
    
    # Apply semua pattern
    for pattern in patterns_to_remove[:-2]:  # Kecuali 2 terakhir (spaces & newlines)
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    
    # Handle multiple spaces dan newlines terakhir
    text = re.sub(r'\s+', ' ', text)  # Multiple spaces jadi single space
    text = re.sub(r'\n+', ' ', text)  # Newlines jadi space
    
    # Clean up
    text = text.strip()
    
    # TIDAK ada filter panjang minimum - semua review dipertahankan
    return text

def clean_reviewer_name(name):
    """
    Membersihkan nama reviewer dari noise
    """
    if pd.isna(name) or name == '':
        return ''
    
    name = str(name).strip()
    
    # Hapus Google Translate markers dari nama
    patterns = [
        r'Diterjemahkan oleh Google\s*',
        r'Translated by Google\s*',
        r'„Éª\s*',
    ]
    
    for pattern in patterns:
        name = re.sub(pattern, '', name, flags=re.IGNORECASE)
    
    name = name.strip()
    
    # Jika nama terlalu pendek atau aneh, kosongkan
    if len(name) < 2 or re.match(r'^[^\w]*$', name):
        return ''
    
    return name

# Fungsi untuk membersihkan dan standarisasi data
def clean_dataset(df, dataset_name, dataset_key):
    """
    Fungsi untuk membersihkan dataset review wisata
    SEMUA REVIEW DIPERTAHANKAN - tidak ada filter panjang minimum
    """
    print(f"\n{'='*50}")
    print(f"Cleaning dataset: {dataset_name}")
    print(f"{'='*50}")
    
    # Informasi awal
    print(f"Jumlah data awal: {len(df)}")
    print(f"Kolom: {list(df.columns)}")
    
    # Update cleaning summary dengan info awal
    cleaning_summary[dataset_key]['columns'] = list(df.columns)
    cleaning_summary[dataset_key]['cleaning_start_rows'] = len(df)
    
    # 1. Hapus duplikat
    df_clean = df.drop_duplicates()
    duplicates_removed = len(df) - len(df_clean)
    if duplicates_removed > 0:
        print(f"‚úì Duplikat dihapus: {duplicates_removed}")
    
    cleaning_summary[dataset_key]['duplicates_in_cleaning'] = duplicates_removed
    
    # 2. Clean review text dari Google Translate markers dan noise
    reviews_cleaned = 0
    if 'review_text' in df_clean.columns:
        print("‚úì Membersihkan teks review dari Google Translate markers...")
        
        # Tampilkan sample sebelum cleaning
        sample_before = df_clean['review_text'].dropna().iloc[:3].tolist()
        print("   Sample sebelum cleaning:")
        for i, sample in enumerate(sample_before, 1):
            preview = str(sample)[:100] + "..." if len(str(sample)) > 100 else str(sample)
            print(f"     {i}. {preview}")
        
        # Count reviews yang mengandung Google Translate markers
        before_cleaning = df_clean['review_text'].astype(str)
        google_translate_count = before_cleaning.str.contains('Diterjemahkan oleh Google|Translated by Google', case=False, na=False).sum()
        
        df_clean['review_text'] = df_clean['review_text'].apply(clean_review_text)
        reviews_cleaned = google_translate_count
        
        # Tampilkan sample setelah cleaning
        sample_after = df_clean['review_text'].dropna().iloc[:3].tolist()
        print("   Sample setelah cleaning:")
        for i, sample in enumerate(sample_after, 1):
            preview = str(sample)[:100] + "..." if len(str(sample)) > 100 else str(sample)
            print(f"     {i}. {preview}")
        
        print(f"   Reviews dengan Google Translate markers: {reviews_cleaned}")
    
    cleaning_summary[dataset_key]['google_translate_cleaned'] = reviews_cleaned
    
    # 3. Clean reviewer names
    names_cleaned = 0
    if 'reviewer_name' in df_clean.columns:
        print("‚úì Membersihkan nama reviewer...")
        before_name_cleaning = df_clean['reviewer_name'].astype(str)
        google_translate_names = before_name_cleaning.str.contains('Diterjemahkan oleh Google|Translated by Google', case=False, na=False).sum()
        
        df_clean['reviewer_name'] = df_clean['reviewer_name'].apply(clean_reviewer_name)
        names_cleaned = google_translate_names
    
    cleaning_summary[dataset_key]['names_cleaned'] = names_cleaned
    
    # 4. Tangani missing values
    missing_before = df_clean.isnull().sum().sum()
    
    # Untuk kolom teks, isi dengan string kosong
    text_columns = ['reviewer_name', 'review_text']
    for col in text_columns:
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].fillna('')
    
    # 5. Bersihkan dan standarisasi visit_time - HANYA 4 KATEGORI
    visit_time_standardized = 0
    if 'visit_time' in df_clean.columns:
        def clean_visit_time(visit_time):
            if pd.isna(visit_time) or visit_time == '' or str(visit_time).strip() == '':
                return 'Tidak diketahui'
            
            visit_time_str = str(visit_time).strip().lower()
            
            # Clean dari Google Translate markers dulu
            visit_time_str = re.sub(r'diterjemahkan oleh google\s*„Éª?\s*', '', visit_time_str)
            visit_time_str = re.sub(r'translated by google\s*„Éª?\s*', '', visit_time_str)
            visit_time_str = visit_time_str.strip()
            
            # HANYA 3 kategori utama yang diizinkan
            if visit_time_str in ['hari biasa', 'weekday', 'weekdays']:
                return 'Hari biasa'
            elif visit_time_str in ['akhir pekan', 'weekend', 'akhir p', 'akhir pekan‚Ä¶', 'akhir p‚Ä¶', 'weekends']:
                return 'Akhir pekan'
            elif visit_time_str in ['hari libur nasional', 'libur nasional', 'hari libur nas', 'hari libur nasional‚Ä¶', 'hari libur nas‚Ä¶', 'public holiday', 'national holiday']:
                return 'Hari libur nasional'
            else:
                # Semua yang lain menjadi 'Tidak diketahui'
                return 'Tidak diketahui'
        
        # Count berapa banyak yang perlu standardisasi
        before_visit_cleaning = df_clean['visit_time'].dropna()
        visit_time_standardized = len(before_visit_cleaning)
        
        # Apply cleaning function
        df_clean['visit_time'] = df_clean['visit_time'].apply(clean_visit_time)
        print(f"‚úì Visit time distandardisasi ke 3 kategori + Tidak diketahui")
        
        # Tampilkan distribusi visit_time setelah cleaning
        visit_time_dist = df_clean['visit_time'].value_counts()
        print(f"   Distribusi visit_time setelah cleaning:")
        for visit_type, count in visit_time_dist.items():
            percentage = (count / len(df_clean)) * 100
            print(f"     '{visit_type}': {count} ({percentage:.1f}%)")
    
    cleaning_summary[dataset_key]['visit_time_standardized'] = visit_time_standardized
    
    missing_after = df_clean.isnull().sum().sum()
    if missing_before > 0:
        print(f"‚úì Missing values ditangani: {missing_before} ‚Üí {missing_after}")
    
    cleaning_summary[dataset_key]['missing_values_handled'] = missing_before - missing_after
    
    # 6. Standarisasi rating (pastikan integer 1-5)
    ratings_standardized = 0
    if 'rating' in df_clean.columns:
        before_rating = df_clean['rating'].copy()
        df_clean['rating'] = pd.to_numeric(df_clean['rating'], errors='coerce')
        df_clean['rating'] = df_clean['rating'].fillna(0).astype(int)
        df_clean['rating'] = df_clean['rating'].clip(1, 5)  # Batasi antara 1-5
        ratings_standardized = len(before_rating)
        print(f"‚úì Rating distandardisasi (1-5)")
    
    cleaning_summary[dataset_key]['ratings_standardized'] = ratings_standardized
    
    # 7. Standarisasi date format
    dates_standardized = 0
    if 'date' in df_clean.columns:
        # Konversi date relative ke format standar
        def standardize_date(date_str):
            if pd.isna(date_str):
                return 'Tidak diketahui'
            
            date_str = str(date_str).lower()
            
            # Clean dari Google Translate markers
            date_str = re.sub(r'diterjemahkan oleh google\s*„Éª?\s*', '', date_str)
            date_str = re.sub(r'translated by google\s*„Éª?\s*', '', date_str)
            date_str = date_str.strip()
            
            # Mapping untuk konversi
            if 'hari lalu' in date_str or 'days ago' in date_str:
                return 'Beberapa hari lalu'
            elif 'minggu lalu' in date_str or 'seminggu lalu' in date_str or 'week ago' in date_str or 'weeks ago' in date_str:
                num = re.search(r'(\d+)', date_str)
                if num:
                    return f'{num.group(1)} minggu lalu'
                return '1 minggu lalu'
            elif 'bulan lalu' in date_str or 'sebulan lalu' in date_str or 'month ago' in date_str or 'months ago' in date_str:
                num = re.search(r'(\d+)', date_str)
                if num:
                    return f'{num.group(1)} bulan lalu'
                return '1 bulan lalu'
            elif 'tahun lalu' in date_str or 'setahun lalu' in date_str or 'year ago' in date_str or 'years ago' in date_str:
                num = re.search(r'(\d+)', date_str)
                if num:
                    return f'{num.group(1)} tahun lalu'
                return '1 tahun lalu'
            else:
                return date_str
        
        dates_standardized = df_clean['date'].dropna().count()
        df_clean['date'] = df_clean['date'].apply(standardize_date)
    
    cleaning_summary[dataset_key]['dates_standardized'] = dates_standardized
    
    # 8. TIDAK ADA FILTER - SEMUA REVIEW DIPERTAHANKAN
    # Hanya hitung review kosong untuk statistik
    empty_reviews = (df_clean['review_text'] == '').sum()
    very_short_reviews = (df_clean['review_text'].str.len() <= 5).sum()
    
    cleaning_summary[dataset_key]['empty_reviews'] = empty_reviews
    cleaning_summary[dataset_key]['very_short_reviews'] = very_short_reviews
    cleaning_summary[dataset_key]['short_reviews_removed'] = 0  # Tidak ada yang dihapus
    
    print(f"‚úì Semua review dipertahankan (tidak ada filter panjang minimum)")
    print(f"   Review kosong: {empty_reviews}")
    print(f"   Review sangat pendek (‚â§5 karakter): {very_short_reviews}")
    
    # 9. Tambahkan kolom wisata
    df_clean['wisata'] = dataset_name
    
    # Update final info
    cleaning_summary[dataset_key]['final_rows'] = len(df_clean)
    cleaning_summary[dataset_key]['retention_rate'] = (len(df_clean) / cleaning_summary[dataset_key]['cleaning_start_rows']) * 100 if cleaning_summary[dataset_key]['cleaning_start_rows'] > 0 else 0
    
    print(f"\nJumlah data setelah cleaning: {len(df_clean)}")
    if len(df) > 0:
        print(f"Persentase data yang tersisa: {len(df_clean)/len(df)*100:.1f}%")
    
    return df_clean

# Mapping nama dataset ke nama wisata yang lebih readable - TOTAL 30 WISATA
wisata_names = {
    'alun_alun': 'Alun Alun Kota Wisata Batu',
    'batu_economis_park': 'Batu Economis Park',
    'batu_love_garden': 'Batu Love Garden (Baloga)',
    'batu_night_spectacular': 'Batu Night Spectacular',
    'batu_rafting': 'Batu Rafting',
    'coban_putri': 'Coban Putri',
    'coban_rais': 'Air Terjun Coban Rais',
    'coban_talun': 'Coban Talun',
    'desa_wisata_punten': 'Desa Wisata Punten',
    'desa_wisata_tulungrejo': 'Desa Wisata Tulungrejo',
    'eco_active_park': 'Eco Active Park',
    'gussari_goa_pinus_batu': 'Gussari Goa Pinus Batu',
    'gunung_arjuno': 'Gunung Arjuno',
    'gunung_panderman': 'Gunung Panderman',
    'jatim_park_1': 'Jatim Park 1',
    'jatim_park_2': 'Jatim Park 2',
    'jatim_park_3': 'Jatim Park 3',
    'lumbung_stroberi': 'Lumbung Stroberi',
    'milenial_glow_garden': 'Milenial Glow Garden',
    'museum_angkut': 'Museum Angkut',
    'paralayang_gunung_banyak': 'Paralayang Gunung Banyak',
    'pemandian_air_panas_cangar': 'Pemandian Air Panas Cangar',
    'songgoriti_hot_springs': 'Songgoriti Hot Springs',
    'taman_dolan': 'Taman Dolan',
    'taman_pinus_campervan': 'Taman Pinus Campervan',
    'taman_selecta': 'Taman Selecta',
    'tirta_nirwana_hotspring': 'Tirta Nirwana Hotspring',
    'wisata_bunga_sidomulyo': 'Wisata Bunga Sidomulyo',
    'wisata_desa_agro_bumiaji': 'Wisata Desa Agro Bumiaji',
    'wisata_petik_apel_mandiri': 'Wisata Petik Apel Mandiri'
}

print("\n" + "="*60)
print("PROSES CLEANING DATASETS")
print("="*60)

# Clean semua dataset
cleaned_datasets = {}
for key, df in datasets.items():
    cleaned_df = clean_dataset(df, wisata_names[key], key)
    cleaned_datasets[key] = cleaned_df

# Gabungkan semua dataset yang sudah dibersihkan
if cleaned_datasets:
    all_reviews = pd.concat(cleaned_datasets.values(), ignore_index=True)
    
    print("\n" + "="*60)
    print("RINGKASAN DATASET GABUNGAN")
    print("="*60)
    print(f"Total review: {len(all_reviews)}")
    print(f"Total wisata: {len(all_reviews['wisata'].unique())}")
    
    # Validasi jumlah wisata
    if len(all_reviews['wisata'].unique()) == 30:
        print("‚úì KONFIRMASI: Total wisata sudah sesuai = 30 wisata")
    else:
        print(f"‚ö†Ô∏è  PERINGATAN: Total wisata tidak sesuai! Expected: 30, Actual: {len(all_reviews['wisata'].unique())}")
        print("   Daftar wisata yang ada:")
        for i, wisata in enumerate(sorted(all_reviews['wisata'].unique()), 1):
            print(f"     {i:2d}. {wisata}")
    
    print(f"\nDistribusi review per wisata (Top 10):")
    top_10_wisata = all_reviews['wisata'].value_counts().head(10)
    for wisata, count in top_10_wisata.items():
        percentage = (count / len(all_reviews)) * 100
        print(f"  {wisata}: {count} ({percentage:.1f}%)")

    print(f"\nDistribusi rating:")
    rating_dist = all_reviews['rating'].value_counts().sort_index()
    for rating, count in rating_dist.items():
        percentage = (count / len(all_reviews)) * 100
        print(f"  Rating {rating}: {count} ({percentage:.1f}%)")

    print(f"\nRata-rata rating keseluruhan: {all_reviews['rating'].mean():.2f}")

    # Cek kolom yang ada
    print(f"\nKolom yang tersedia:")
    print(all_reviews.columns.tolist())

    # Statistik visit_time - hanya 4 kategori
    if 'visit_time' in all_reviews.columns:
        print(f"\nDistribusi waktu kunjungan (4 kategori):")
        visit_dist = all_reviews['visit_time'].value_counts()
        for visit_type, count in visit_dist.items():
            percentage = (count / len(all_reviews)) * 100
            print(f"  '{visit_type}': {count} ({percentage:.1f}%)")
        
        # Validasi bahwa hanya ada 4 kategori
        unique_categories = all_reviews['visit_time'].dropna().unique()
        expected_categories = ['Hari biasa', 'Akhir pekan', 'Hari libur nasional', 'Tidak diketahui']
        print(f"\nValidasi kategori visit_time:")
        print(f"  Kategori yang ada: {sorted(unique_categories)}")
        print(f"  ‚úì Sesuai ekspektasi: {set(unique_categories) == set(expected_categories)}")
        
        # Cek apakah ada NaN yang tersisa
        nan_count = all_reviews['visit_time'].isna().sum()
        if nan_count > 0:
            print(f"  ‚ö†Ô∏è  Masih ada {nan_count} nilai NaN yang perlu ditangani")

    # Statistik cleaning
    print(f"\nStatistik cleaning:")
    empty_reviews = (all_reviews['review_text'] == '').sum()
    print(f"  Review kosong setelah cleaning: {empty_reviews}")
    very_short_reviews = (all_reviews['review_text'].str.len() <= 5).sum()
    print(f"  Review sangat pendek (‚â§5 char): {very_short_reviews}")
    short_reviews = (all_reviews['review_text'].str.len() < 50).sum()
    print(f"  Review pendek (<50 char): {short_reviews}")
    avg_length = all_reviews['review_text'].str.len().mean()
    print(f"  Rata-rata panjang review: {avg_length:.0f} karakter")

    print("\n" + "="*60)
    print("MENYIMPAN DATASET KE FOLDER CLEANING_DATA")
    print("="*60)

    # Pastikan tidak ada NaN di visit_time sebelum menyimpan
    if 'visit_time' in all_reviews.columns:
        all_reviews['visit_time'] = all_reviews['visit_time'].fillna('Tidak diketahui')
        print("‚úì NaN di visit_time diubah menjadi 'Tidak diketahui'")

    # Simpan dataset gabungan yang sudah bersih
    combined_filename = os.path.join(output_folder, 'combined_batu_tourism_reviews_cleaned.csv')
    all_reviews.to_csv(combined_filename, index=False)
    print(f"‚úì Dataset gabungan disimpan: {combined_filename}")

    # Buat laporan kualitas data dengan detail cleaning
    def data_quality_report(df, cleaning_summary):
        """
        Generate laporan kualitas data dengan detail cleaning
        """
        report = []
        report.append("="*80)
        report.append("LAPORAN KUALITAS DATA - BATU TOURISM REVIEWS")
        report.append("="*80)
        report.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report.append(f"Total reviews: {len(df):,}")
        report.append(f"Total wisata: {len(df['wisata'].unique())}")
        
        # Validasi 30 wisata
        if len(df['wisata'].unique()) == 30:
            report.append("‚úì KONFIRMASI: Total wisata = 30 wisata (sesuai target)")
        else:
            report.append(f"‚ö†Ô∏è  PERINGATAN: Total wisata = {len(df['wisata'].unique())} (target: 30)")
        
        # BAGIAN BARU: Detail proses cleaning per wisata
        report.append("\n" + "="*80)
        report.append("DETAIL PROSES CLEANING PER WISATA")
        report.append("="*80)
        report.append("CATATAN: SEMUA REVIEW DIPERTAHANKAN - TIDAK ADA FILTER PANJANG MINIMUM")
        
        # Hitung total statistik cleaning
        total_files = 0
        total_original_rows = 0
        total_final_rows = 0
        total_google_translate = 0
        total_names_cleaned = 0
        total_duplicates = 0
        total_empty_reviews = 0
        total_very_short_reviews = 0
        successful_cleanings = 0
        failed_cleanings = 0
        
        for key, summary in cleaning_summary.items():
            if summary['status'].startswith('SUCCESS'):
                successful_cleanings += 1
                total_files += summary['files_loaded']
                total_original_rows += summary.get('original_rows', 0)
                total_final_rows += summary.get('final_rows', 0)
                total_google_translate += summary.get('google_translate_cleaned', 0)
                total_names_cleaned += summary.get('names_cleaned', 0)
                total_duplicates += summary.get('duplicates_in_cleaning', 0)
                total_empty_reviews += summary.get('empty_reviews', 0)
                total_very_short_reviews += summary.get('very_short_reviews', 0)
            else:
                failed_cleanings += 1
        
        report.append(f"\nRINGKASAN PROSES CLEANING:")
        report.append(f"‚úì Wisata berhasil di-cleaning: {successful_cleanings}")
        report.append(f"‚úó Wisata gagal di-cleaning: {failed_cleanings}")
        report.append(f"‚úì Total file CSV diproses: {total_files}")
        report.append(f"‚úì Total rows awal: {total_original_rows:,}")
        report.append(f"‚úì Total rows akhir: {total_final_rows:,}")
        report.append(f"‚úì Total retention rate: {(total_final_rows/total_original_rows*100):.1f}%")
        report.append(f"‚úì Reviews dengan Google Translate markers dibersihkan: {total_google_translate:,}")
        report.append(f"‚úì Nama reviewer dibersihkan: {total_names_cleaned:,}")
        report.append(f"‚úì Duplikat dihapus: {total_duplicates:,}")
        report.append(f"‚úì Review kosong: {total_empty_reviews:,}")
        report.append(f"‚úì Review sangat pendek (‚â§5 char): {total_very_short_reviews:,}")
        report.append(f"‚úì Review pendek/kosong dihapus: 0 (SEMUA DIPERTAHANKAN)")
        
        report.append(f"\nDETAIL PER WISATA:")
        report.append("-" * 80)
        
        # Sort berdasarkan nama wisata
        sorted_keys = sorted(cleaning_summary.keys(), key=lambda x: wisata_names.get(x, x))
        
        for i, key in enumerate(sorted_keys, 1):
            summary = cleaning_summary[key]
            wisata_name = wisata_names.get(key, key)
            
            report.append(f"\n{i:2d}. {wisata_name}")
            report.append(f"    Status: {summary['status']}")
            
            if summary['status'].startswith('SUCCESS'):
                report.append(f"    File(s): {summary['files_loaded']} file(s) berhasil dimuat")
                for file_info in summary['files']:
                    report.append(f"      - {file_info}")
                
                if 'after_dedup_rows' in summary:
                    report.append(f"    Rows awal (gabungan): {summary['original_rows']:,}")
                    report.append(f"    Setelah dedup gabungan: {summary['after_dedup_rows']:,}")
                    report.append(f"    Duplikat saat gabung: {summary['duplicates_removed']:,}")
                else:
                    report.append(f"    Rows awal: {summary['original_rows']:,}")
                
                report.append(f"    Rows akhir: {summary['final_rows']:,}")
                report.append(f"    Retention rate: {summary.get('retention_rate', 0):.1f}%")
                report.append(f"    Kolom: {', '.join(summary.get('columns', []))}")
                
                # Detail cleaning
                cleaning_details = []
                if summary.get('google_translate_cleaned', 0) > 0:
                    cleaning_details.append(f"Google Translate: {summary['google_translate_cleaned']}")
                if summary.get('names_cleaned', 0) > 0:
                    cleaning_details.append(f"Nama: {summary['names_cleaned']}")
                if summary.get('duplicates_in_cleaning', 0) > 0:
                    cleaning_details.append(f"Duplikat: {summary['duplicates_in_cleaning']}")
                if summary.get('empty_reviews', 0) > 0:
                    cleaning_details.append(f"Review kosong: {summary['empty_reviews']}")
                if summary.get('very_short_reviews', 0) > 0:
                    cleaning_details.append(f"Review ‚â§5 char: {summary['very_short_reviews']}")
                
                if cleaning_details:
                    report.append(f"    Cleaning: {', '.join(cleaning_details)}")
                
                report.append(f"    ‚úì SEMUA REVIEW DIPERTAHANKAN")
                
            else:
                report.append(f"    ‚ùå GAGAL: Tidak ada file yang berhasil dimuat")
                report.append(f"    File(s) yang dicoba:")
                for file_info in summary['files']:
                    report.append(f"      - {file_info}")
        
        # Daftar semua wisata dalam dataset final
        report.append(f"\n" + "="*80)
        report.append("DAFTAR WISATA DALAM DATASET FINAL")
        report.append("="*80)
        
        wisata_final_counts = df['wisata'].value_counts().sort_index()
        for i, (wisata, count) in enumerate(wisata_final_counts.items(), 1):
            percentage = (count / len(df)) * 100
            avg_rating = df[df['wisata'] == wisata]['rating'].mean()
            report.append(f"{i:2d}. {wisata}")
            report.append(f"    Reviews: {count:,} ({percentage:.1f}%)")
            report.append(f"    Avg Rating: {avg_rating:.2f}")
        
        # 1. Completeness
        report.append(f"\n" + "="*80)
        report.append("1. KELENGKAPAN DATA")
        report.append("="*80)
        for col in df.columns:
            missing_pct = (df[col].isnull().sum() / len(df)) * 100
            empty_pct = ((df[col] == '').sum() / len(df)) * 100
            complete_pct = 100 - missing_pct - empty_pct
            report.append(f"   {col}: {complete_pct:.1f}% lengkap")
        
        # 2. Consistency
        report.append(f"\n2. KONSISTENSI DATA")
        report.append("-" * 40)
        report.append(f"   Rating dalam range 1-5: {((df['rating'] >= 1) & (df['rating'] <= 5)).all()}")
        
        if 'visit_time' in df.columns:
            unique_visit_times = df['visit_time'].nunique()
            report.append(f"   Nilai unik visit_time: {unique_visit_times}")
            categories = [cat for cat in df['visit_time'].unique() if pd.notna(cat)]
            report.append(f"   Kategori visit_time: {sorted(categories)}")
        
        # 3. Data distribution
        report.append(f"\n3. DISTRIBUSI DATA")
        report.append("-" * 40)
        report.append(f"   Rata-rata panjang review: {df['review_text'].str.len().mean():.0f} karakter")
        report.append(f"   Review terpendek: {df['review_text'].str.len().min()} karakter")
        report.append(f"   Review terpanjang: {df['review_text'].str.len().max()} karakter")
        report.append(f"   Rata-rata rating keseluruhan: {df['rating'].mean():.2f}")
        
        # 4. Visit time distribution (hanya 4 kategori)
        if 'visit_time' in df.columns:
            report.append(f"\n4. DISTRIBUSI WAKTU KUNJUNGAN")
            report.append("-" * 40)
            visit_counts = df['visit_time'].value_counts()
            for visit_type in ['Hari biasa', 'Akhir pekan', 'Hari libur nasional', 'Tidak diketahui']:
                count = visit_counts.get(visit_type, 0)
                percentage = (count / len(df)) * 100
                report.append(f"   '{visit_type}': {count:,} ({percentage:.1f}%)")
        
        # 5. Rating distribution
        report.append(f"\n5. DISTRIBUSI RATING")
        report.append("-" * 40)
        rating_counts = df['rating'].value_counts().sort_index()
        for rating, count in rating_counts.items():
            percentage = (count / len(df)) * 100
            report.append(f"   Rating {rating}: {count:,} ({percentage:.1f}%)")
        
        # 6. Text quality after cleaning
        report.append(f"\n6. KUALITAS TEKS SETELAH CLEANING")
        report.append("-" * 40)
        empty_reviews = (df['review_text'] == '').sum()
        report.append(f"   Review kosong: {empty_reviews:,} ({empty_reviews/len(df)*100:.1f}%)")
        
        very_short_reviews = (df['review_text'].str.len() <= 5).sum()
        report.append(f"   Review ‚â§ 5 karakter: {very_short_reviews:,} ({very_short_reviews/len(df)*100:.1f}%)")
        
        short_reviews = (df['review_text'].str.len() < 50).sum()
        report.append(f"   Review < 50 karakter: {short_reviews:,} ({short_reviews/len(df)*100:.1f}%)")
        
        very_long_reviews = (df['review_text'].str.len() > 1000).sum()
        report.append(f"   Review > 1000 karakter: {very_long_reviews:,} ({very_long_reviews/len(df)*100:.1f}%)")
        
        empty_names = (df['reviewer_name'] == '').sum()
        report.append(f"   Nama reviewer kosong: {empty_names:,} ({empty_names/len(df)*100:.1f}%)")
        
        # 7. Top 10 wisata berdasarkan jumlah review
        report.append(f"\n7. TOP 10 WISATA (BERDASARKAN JUMLAH REVIEW)")
        report.append("-" * 40)
        top_wisata = df['wisata'].value_counts().head(10)
        for i, (wisata, count) in enumerate(top_wisata.items(), 1):
            percentage = (count / len(df)) * 100
            avg_rating = df[df['wisata'] == wisata]['rating'].mean()
            report.append(f"   {i:2d}. {wisata}: {count:,} reviews ({percentage:.1f}%) - Avg Rating: {avg_rating:.2f}")
        
        # 8. Top 10 wisata berdasarkan rating tertinggi (min 10 reviews)
        report.append(f"\n8. TOP 10 WISATA (BERDASARKAN RATING TERTINGGI, MIN 10 REVIEWS)")
        report.append("-" * 40)
        wisata_stats = df.groupby('wisata').agg({
            'rating': ['mean', 'count'],
            'review_text': lambda x: x.str.len().mean()
        }).round(2)
        
        wisata_stats.columns = ['avg_rating', 'review_count', 'avg_review_length']
        wisata_stats = wisata_stats[wisata_stats['review_count'] >= 10]
        top_rated = wisata_stats.sort_values('avg_rating', ascending=False).head(10)
        
        for i, (wisata, stats) in enumerate(top_rated.iterrows(), 1):
            report.append(f"   {i:2d}. {wisata}: {stats['avg_rating']:.2f} rating ({int(stats['review_count'])} reviews)")
        
        return '\n'.join(report)

    # Generate dan simpan laporan kualitas
    quality_report = data_quality_report(all_reviews, cleaning_summary)
    print("\n" + quality_report)

    # Simpan laporan ke file
    report_filename = os.path.join(output_folder, 'data_quality_report.txt')
    with open(report_filename, 'w', encoding='utf-8') as f:
        f.write(quality_report)
    print(f"\n‚úì Laporan kualitas data disimpan: {report_filename}")

    # Buat summary statistics dan simpan
    summary_data = []
    for wisata in sorted(all_reviews['wisata'].unique()):
        wisata_data = all_reviews[all_reviews['wisata'] == wisata]
        
        summary_row = {
            'Wisata': wisata,
            'Total_Reviews': len(wisata_data),
            'Avg_Rating': round(wisata_data['rating'].mean(), 2),
            'Avg_Review_Length': round(wisata_data['review_text'].str.len().mean(), 0),
            'Min_Rating': wisata_data['rating'].min(),
            'Max_Rating': wisata_data['rating'].max(),
            'Empty_Reviews': (wisata_data['review_text'] == '').sum(),
            'Very_Short_Reviews': (wisata_data['review_text'].str.len() <= 5).sum(),
            'Short_Reviews': (wisata_data['review_text'].str.len() < 50).sum()
        }
        
        # Tambahkan distribusi rating
        rating_counts = wisata_data['rating'].value_counts()
        for rating in [1, 2, 3, 4, 5]:
            summary_row[f'Rating_{rating}'] = rating_counts.get(rating, 0)
        
        # Tambahkan distribusi visit_time (4 kategori saja)
        if 'visit_time' in wisata_data.columns:
            visit_counts = wisata_data['visit_time'].value_counts()
            summary_row['Hari_Biasa'] = visit_counts.get('Hari biasa', 0)
            summary_row['Akhir_Pekan'] = visit_counts.get('Akhir pekan', 0)
            summary_row['Hari_Libur_Nasional'] = visit_counts.get('Hari libur nasional', 0)
            summary_row['Tidak_Diketahui'] = visit_counts.get('Tidak diketahui', 0)
        
        summary_data.append(summary_row)
    
    summary_stats = pd.DataFrame(summary_data)
    # Urutkan berdasarkan jumlah review (descending)
    summary_stats = summary_stats.sort_values('Total_Reviews', ascending=False)
    
    summary_filename = os.path.join(output_folder, 'summary_statistics.csv')
    summary_stats.to_csv(summary_filename, index=False)
    print(f"‚úì Summary statistik disimpan: {summary_filename}")

    # Tampilkan ringkasan akhir
    print(f"\n" + "="*60)
    print("RINGKASAN HASIL CLEANING")
    print("="*60)
    print(f"‚úì Total reviews berhasil diproses: {len(all_reviews):,}")
    print(f"‚úì Total wisata: {len(all_reviews['wisata'].unique())}")
    
    # Konfirmasi 30 wisata
    if len(all_reviews['wisata'].unique()) == 30:
        print("‚úÖ KONFIRMASI: Total wisata = 30 (SESUAI TARGET)")
    else:
        print(f"‚ùå PERINGATAN: Total wisata = {len(all_reviews['wisata'].unique())} (TARGET: 30)")
    
    print(f"‚úì Rata-rata rating keseluruhan: {all_reviews['rating'].mean():.2f}")
    print(f"‚úì Rata-rata panjang review: {all_reviews['review_text'].str.len().mean():.0f} karakter")
    
    # Statistik cleaning
    empty_after = (all_reviews['review_text'] == '').sum()
    very_short_after = (all_reviews['review_text'].str.len() <= 5).sum()
    print(f"‚úì Review kosong setelah cleaning: {empty_after}")
    print(f"‚úì Review sangat pendek (‚â§5 char): {very_short_after}")
    print(f"‚úì SEMUA REVIEW DIPERTAHANKAN - tidak ada yang dihapus karena terlalu pendek")
    
    # Top 5 wisata berdasarkan jumlah review
    print(f"\n‚úì Top 5 wisata berdasarkan jumlah review:")
    top_5 = all_reviews['wisata'].value_counts().head(5)
    for i, (wisata, count) in enumerate(top_5.items(), 1):
        avg_rating = all_reviews[all_reviews['wisata'] == wisata]['rating'].mean()
        print(f"   {i}. {wisata}: {count} reviews (Rating: {avg_rating:.2f})")
    
    # Konfirmasi pemisahan Tirta Nirwana dan Songgoriti
    tirta_count = len(all_reviews[all_reviews['wisata'] == 'Tirta Nirwana Hotspring'])
    songgoriti_count = len(all_reviews[all_reviews['wisata'] == 'Songgoriti Hot Springs'])
    punten_count = len(all_reviews[all_reviews['wisata'] == 'Desa Wisata Punten'])
    print(f"\n‚úì Konfirmasi wisata:")
    print(f"   - Tirta Nirwana Hotspring: {tirta_count} reviews")
    print(f"   - Songgoriti Hot Springs: {songgoriti_count} reviews")
    print(f"   - Desa Wisata Punten: {punten_count} reviews (sekarang dipertahankan)")
    
    print(f"\n‚úì Files tersimpan di folder '{output_folder}':")
    print(f"   1. combined_batu_tourism_reviews_cleaned.csv")
    print(f"   2. data_quality_report.txt")
    print(f"   3. summary_statistics.csv")
    
else:
    print("‚ùå Tidak ada dataset yang berhasil dimuat dan dibersihkan!")

print(f"\n{'='*60}")
print("PROSES CLEANING SELESAI")
print(f"{'='*60}")
print(f"‚úì Folder output: {output_folder}")
print(f"‚úì Dataset gabungan: combined_batu_tourism_reviews_cleaned.csv")
print(f"‚úì Laporan kualitas: data_quality_report.txt")
print(f"‚úì Summary statistik: summary_statistics.csv")
print(f"‚úì Total wisata: 30 (termasuk Tirta Nirwana Hotspring yang terpisah)")
print(f"‚úì Visit time categories: 4 kategori (Hari biasa, Akhir pekan, Hari libur nasional, Tidak diketahui)")
print(f"‚úì Text cleaning: Google Translate markers dan noise dihapus")
print(f"‚úì SEMUA REVIEW DIPERTAHANKAN - tidak ada filter panjang minimum")

‚úì Folder sumber ditemukan: hasil scraping
‚úì Folder 'cleaning_data' sudah ada
‚úì Membaca dataset dari folder: hasil scraping
‚úì Hasil cleaning akan disimpan ke folder: cleaning_data

LOADING DATASETS DARI FOLDER HASIL SCRAPING
‚úì air_terjun_coban_rais_ALL_reviews_20250802_102141.csv berhasil dimuat (3358 rows)
‚úì coban_rais_reviews_20250804_223751.csv berhasil dimuat (25 rows)
  ‚Üí Digabung menjadi 3383 rows (duplikat dihapus: 0)
‚úì alun_alun_kota_wisata_batu_reviews_20250725_105504.csv berhasil dimuat (759 rows)
‚úì batu_economis_park_ALL_reviews_20250802_172842.csv berhasil dimuat (1892 rows)
‚úì batu_love_garden_baloga_ALL_reviews_20250802_181709.csv berhasil dimuat (403 rows)
‚úì batu_night_spectacular_ALL_reviews_20250724_154759.csv berhasil dimuat (631 rows)
‚úì batu_rafting_reviews_20250810_224610.csv berhasil dimuat (383 rows)
‚úì coban_putri_reviews_all_20250803_001704.csv berhasil dimuat (323 rows)
‚úì coban_talun_ALL_reviews_20250724_114716.csv berhasil dimuat (1442

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['review_text'] = df_clean['review_text'].apply(clean_review_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['reviewer_name'] = df_clean['reviewer_name'].apply(clean_reviewer_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[col] = df_clean[col].fillna('')
A value 

   Sample setelah cleaning:
     1. Seperti tempat liburan pada umumnya, bukanya jam 10 pagi dengan tiket yang bisa dibeli on the spot. ...
     2. Cocok untuk edukasi science terutama untuk anak sekolah. Bisa dicoba alat peraga yang ada di dlm. Be...
     3. Ticket lumayan mahal karena kebetulan kami berkunjung pas high season, @170rb untuk ticket terusan s...
   Reviews dengan Google Translate markers: 0
‚úì Membersihkan nama reviewer...
‚úì Visit time distandardisasi ke 3 kategori + Tidak diketahui
   Distribusi visit_time setelah cleaning:
     'Tidak diketahui': 1209 (75.0%)
     'Akhir pekan': 169 (10.5%)
     'Hari biasa': 156 (9.7%)
     'Hari libur nasional': 79 (4.9%)
‚úì Missing values ditangani: 1199 ‚Üí 0
‚úì Rating distandardisasi (1-5)
‚úì Semua review dipertahankan (tidak ada filter panjang minimum)
   Review kosong: 122
   Review sangat pendek (‚â§5 karakter): 122

Jumlah data setelah cleaning: 1613
Persentase data yang tersisa: 100.0%

Cleaning dataset: Jatim Park 2
J

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['review_text'] = df_clean['review_text'].apply(clean_review_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['reviewer_name'] = df_clean['reviewer_name'].apply(clean_reviewer_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[col] = df_clean[col].fillna('')
A value 

   Sample setelah cleaning:
     1. Datang kesini karena menjadi tujuan keluarga saat liburan anak sekolah juli 2025. Banyak hal bisa di...
     2. Wisata Museum Transportasi yang berada di Kota Batu sesuai dengan tema namanya disini menampilkan ko...
     3. Surprisingly, semua unit mobil, kendaraan yang ditawarkan adalah koleksi pribadi yang masih bisa ber...
   Reviews dengan Google Translate markers: 0
‚úì Membersihkan nama reviewer...
‚úì Visit time distandardisasi ke 3 kategori + Tidak diketahui
   Distribusi visit_time setelah cleaning:
     'Tidak diketahui': 3480 (74.3%)
     'Akhir pekan': 471 (10.1%)
     'Hari biasa': 465 (9.9%)
     'Hari libur nasional': 269 (5.7%)
‚úì Missing values ditangani: 3442 ‚Üí 0
‚úì Rating distandardisasi (1-5)
‚úì Semua review dipertahankan (tidak ada filter panjang minimum)
   Review kosong: 1101
   Review sangat pendek (‚â§5 karakter): 1101

Jumlah data setelah cleaning: 4685
Persentase data yang tersisa: 100.0%

Cleaning dataset: Paralayang 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['review_text'] = df_clean['review_text'].apply(clean_review_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['reviewer_name'] = df_clean['reviewer_name'].apply(clean_reviewer_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[col] = df_clean[col].fillna('')
A value 

Total wisata: 30
‚úì KONFIRMASI: Total wisata sudah sesuai = 30 wisata

Distribusi review per wisata (Top 10):
  Museum Angkut: 4685 (14.7%)
  Eco Active Park: 3660 (11.5%)
  Air Terjun Coban Rais: 3383 (10.6%)
  Batu Economis Park: 1892 (5.9%)
  Gussari Goa Pinus Batu: 1809 (5.7%)
  Jatim Park 1: 1613 (5.1%)
  Jatim Park 2: 1562 (4.9%)
  Pemandian Air Panas Cangar: 1559 (4.9%)
  Coban Talun: 1442 (4.5%)
  Songgoriti Hot Springs: 997 (3.1%)

Distribusi rating:
  Rating 1: 774 (2.4%)
  Rating 2: 524 (1.6%)
  Rating 3: 1867 (5.9%)
  Rating 4: 6512 (20.5%)
  Rating 5: 22166 (69.6%)

Rata-rata rating keseluruhan: 4.53

Kolom yang tersedia:
['reviewer_name', 'rating', 'date', 'visit_time', 'review_text', 'wisata']

Distribusi waktu kunjungan (4 kategori):
  'Tidak diketahui': 22346 (70.2%)
  'Akhir pekan': 3132 (9.8%)
  'Hari biasa': 2529 (7.9%)
  'Hari libur nasional': 1339 (4.2%)

Validasi kategori visit_time:
  Kategori yang ada: ['Akhir pekan', 'Hari biasa', 'Hari libur nasional', 'Tida