# In [2]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import re

# Folder locations: raw scraped CSVs are read from the source folders and the
# cleaned output is written to the output folder.
source_folders = ['hasil scraping', 'hasil scraping rating rendah']
output_folder = 'cleaning_data'

# Keep only the source folders that are actually present.
available_folders = []
for folder in source_folders:
    # isdir rather than exists: a regular file that happens to carry the same
    # name must not be treated as a source folder.
    if os.path.isdir(folder):
        available_folders.append(folder)
        print(f"✓ Folder sumber ditemukan: {folder}")
    else:
        print(f"❌ Folder tidak ditemukan: {folder}")

if not available_folders:
    print("❌ Tidak ada folder sumber yang ditemukan!")
    print("Pastikan folder 'hasil scraping' dan/atau 'hasil scraping rating rendah' tersedia.")
    # exit() is a helper the `site` module adds for interactive sessions and is
    # not guaranteed to exist; SystemExit works everywhere and the non-zero
    # status tells the caller that the run failed.
    raise SystemExit(1)

# Create the output folder when it is missing and report which case applied.
if not os.path.isdir(output_folder):
    # exist_ok guards against the folder appearing between the check and the call.
    os.makedirs(output_folder, exist_ok=True)
    print(f"✓ Folder '{output_folder}' berhasil dibuat")
else:
    print(f"✓ Folder '{output_folder}' sudah ada")

print(f"✓ Membaca dataset dari folder: {', '.join(available_folders)}")
print(f"✓ Hasil cleaning akan disimpan ke folder: {output_folder}")

# Loaded datasets: destination key -> DataFrame (filled by the loading loop below)
datasets = {}
# Per-destination bookkeeping for the loading and cleaning steps
cleaning_summary = {}

print("\n" + "="*60)
print("LOADING DATASETS DARI SEMUA FOLDER SUMBER")
print("="*60)

# Destination key -> CSV filenames to load for that destination.
# Each filename is looked up in every available source folder, and the files
# listed under one key are merged into a single dataset.
file_mappings = {
    'coban_rais': [
        'air_terjun_coban_rais_ALL_reviews_20250802_102141.csv',
        'coban_rais_reviews_20250804_223751.csv',
        'air_terjun_coban_rais_reviews_1to3stars_with_text_20250828_135744.csv'
    ],
    'alun_alun': [
        'alun_alun_kota_wisata_batu_reviews_20250725_105504.csv',
        'alun_alun_batu_reviews_1to3stars_with_text_20250831_180701.csv'          
    ],
    'batu_economis_park': [
        'batu_economis_park_ALL_reviews_20250802_172842.csv',
        'batu_economis_park_reviews_1to3stars_with_text_20250828_161649.csv'
    ],
    'batu_love_garden': [
        'batu_love_garden_baloga_ALL_reviews_20250802_181709.csv',
        'batu_love_garden_baloga_reviews_1to3stars_with_text_20250828_204631.csv'
    ],
    'batu_night_spectacular': [
        'batu_night_spectacular_ALL_reviews_20250724_154759.csv',
        'bns_reviews_1to3stars_with_text_20250829_090737.csv'
    ],
    'batu_rafting': [
        'batu_rafting_reviews_20250810_224610.csv',
        'batu_rafting_reviews_1to3stars_with_text_20250829_010450.csv'
    ],
    'coban_putri': ['coban_putri_reviews_all_20250803_001704.csv'],
    'coban_talun': [
        'coban_talun_ALL_reviews_20250724_114716.csv',
        'coban_talun_reviews_1to3stars_with_text_20250831_121410.csv'
    ],
    'desa_wisata_punten': ['desa_wisata_punten_reviews_20250810_230050.csv'],
    'desa_wisata_tulungrejo': ['desa_wisata_tulungrejo_reviews_20250802_013319.csv'],
    'eco_active_park': [
        'eco_active_park_ALL_reviews_20250725_002543.csv',
        'ecopark_tulungrejo_reviews_1to3stars_with_text_20250829_175926.csv'
    ],
    'gussari_goa_pinus_batu': [
        'gussari_goa_pinus_batu_ALL_reviews_20250811_021707.csv',
        'goa_pinus_reviews_1to3stars_with_text_20250829_194218.csv'
    ],
    'gunung_arjuno': ['gunung_arjuno_reviews_20250804_180540.csv'],
    'gunung_panderman': ['gunung_panderman_reviews_20250724_141423.csv'],
    'jatim_park_1': [
        'jatim_park_1_ALL_reviews_20250723_140636.csv',
        'jatim_park1_reviews_1to3stars_with_text_20250829_224028.csv'
    ],
    'jatim_park_2': [
        'jatim_park_2_ALL_reviews_20250725_114708.csv',
        'jatim_park2_reviews_1to3stars_with_text_20250830_010927.csv'
    ],
    'jatim_park_3': [
        'jatim_park_3_ALL_reviews_20250725_011845.csv',
        'jatim_park3_reviews_1to3stars_with_text_20250830_092313.csv'
    ],
    'lumbung_stroberi': ['lumbung_stroberi_ALL_reviews_20250804_160259.csv'],
    'milenial_glow_garden': [
        'milenial_glow_garden_ALL_reviews_20250802_201820.csv',
        'milenial_glow_garden_reviews_1to3stars_with_text_20250830_105812.csv'
    ],
    'museum_angkut': [
        'museum_angkut_ALL_reviews_20250725_160008.csv',
        'museum_angkut_reviews_1to3stars_with_text_20250830_182721.csv'
    ],
    'paralayang_gunung_banyak': ['paralayang_gunung_banyak_reviews_20250802_235154.csv'],
    'pemandian_air_panas_cangar': [
        'pemandian_air_panas_cangar_ALL_reviews_20250802_230623.csv',
        'permandian_air_panas_cangar_reviews_1to3stars_with_text_20250830_213548.csv'
    ],
    'songgoriti_hot_springs': [
        'songgoriti_hot_springs_all_reviews_20250804_192319.csv',
        'songgoriti_hot_spring_reviews_1to3stars_with_text_20250830_220759.csv'
    ],
    'taman_dolan': [
        'taman_dolan_ALL_reviews_20250804_174157.csv',
        'taman_dolan_reviews_1to3stars_with_text_20250831_020414.csv'
    ],
    'taman_pinus_campervan': ['taman_pinus_campervan_ALL_reviews_20250804_160757.csv'],
    'taman_selecta': [
        'taman_selecta_ALL_reviews_20250724_085149.csv',
        'taman_selecta_reviews_1to3stars_with_text_20250831_104640.csv'
    ],
    'tirta_nirwana_hotspring': [
        'tirta_nirwana_songgoriti_all_reviews_20250803_011926.csv',
        'tirta_nirwana_songgoriti_all_reviews_newest_20250804_213237.csv'
    ],
    'wisata_bunga_sidomulyo': [
        'wisata_bunga_sidomulyo_reviews_20250802_174108.csv',
        'wisata_bunga_sidomulyo_reviews_20250803_012801.csv',
        'rest_area_desa_wisata_sidomulyo_reviews_20250802_174921.csv'
    ],
    'wisata_desa_agro_bumiaji': ['wisata_desa_agro_bumiaji_reviews_20250804_150849.csv'],
    'wisata_petik_apel_mandiri': ['wisata_petik_apel_mandiri_reviews_20250804_181035.csv']
}

def find_file_in_folders(filename, folders):
    """Locate ``filename`` inside the first of ``folders`` that contains it.

    Args:
        filename: Name of the file to look for.
        folders: Folders to search, in priority order.

    Returns:
        ``(path, folder)`` for the first folder where the joined path exists,
        or ``(None, None)`` when no folder contains it.
    """
    candidates = (
        (os.path.join(directory, filename), directory) for directory in folders
    )
    # The generator is lazy, so the search stops at the first existing path.
    return next(
        (candidate for candidate in candidates if os.path.exists(candidate[0])),
        (None, None),
    )

# Load every mapped CSV and merge the files that belong to the same destination.
for key, filenames in file_mappings.items():
    combined_data = []
    files_loaded = 0
    files_info = []

    for filename in filenames:
        file_path, found_folder = find_file_in_folders(filename, available_folders)

        if not file_path:
            files_info.append(f"{filename} (FILE NOT FOUND)")
            print(f"❌ File tidak ditemukan di semua folder: {filename}")
            continue

        # Best effort on purpose: one unreadable file is recorded and reported
        # but must not abort the whole run. Only the read itself is guarded.
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            files_info.append(f"{filename} (ERROR: {str(e)}) - from '{found_folder}'")
            print(f"❌ Error loading {filename}: {e}")
        else:
            combined_data.append(df)
            files_loaded += 1
            files_info.append(f"{filename} ({len(df)} rows) - from '{found_folder}'")
            print(f"✓ {filename} berhasil dimuat ({len(df)} rows) dari '{found_folder}'")

    if not combined_data:
        # Nothing could be loaded for this destination.
        cleaning_summary[key] = {
            'files': files_info,
            'files_loaded': 0,
            'original_rows': 0,
            'status': 'FAILED'
        }
    elif len(combined_data) == 1:
        # A single file needs no merging.
        datasets[key] = combined_data[0]
        cleaning_summary[key] = {
            'files': files_info,
            'files_loaded': files_loaded,
            'original_rows': len(combined_data[0]),
            'status': 'SUCCESS'
        }
    else:
        # Several files: stack them, then drop the rows they have in common.
        combined_df = pd.concat(combined_data, ignore_index=True)
        original_combined = len(combined_df)

        # Compare on the identifying columns that are present; fall back to
        # whole-row comparison when none of them exists.
        duplicate_columns = [
            col for col in ('reviewer_name', 'review_text', 'rating', 'date')
            if col in combined_df.columns
        ]
        if duplicate_columns:
            combined_df = combined_df.drop_duplicates(subset=duplicate_columns, keep='first')
        else:
            combined_df = combined_df.drop_duplicates()

        datasets[key] = combined_df
        cleaning_summary[key] = {
            'files': files_info,
            'files_loaded': files_loaded,
            'original_rows': original_combined,
            'after_dedup_rows': len(combined_df),
            'duplicates_removed': original_combined - len(combined_df),
            'status': 'SUCCESS (MERGED)'
        }
        print(f"  → Digabung menjadi {len(combined_df)} rows (duplikat dihapus: {original_combined - len(combined_df)})")

if not datasets:
    print("❌ Tidak ada dataset yang berhasil dimuat!")
    # exit() is an interactive-session helper from `site`; SystemExit is always
    # available and the non-zero status marks the run as failed.
    raise SystemExit(1)

print(f"\n✓ Total dataset berhasil dimuat: {len(datasets)}")

# Emoji / pictograph code-point ranges, merged into one character class so a
# single pass removes them all.
_EMOJI_PATTERN = re.compile(
    r'['
    r'\U0001F600-\U0001F64F'  # Emoticons
    r'\U0001F300-\U0001F5FF'  # Symbols & Pictographs (incl. skin-tone modifiers)
    r'\U0001F680-\U0001F6FF'  # Transport & Map Symbols
    r'\U00002600-\U000026FF'  # Miscellaneous Symbols
    r'\U00002700-\U000027BF'  # Dingbats
    r'\U0001F100-\U0001F1FF'  # Enclosed Alphanumeric Supplement (incl. flags / regional indicators)
    r'\U0001F200-\U0001F2FF'  # Enclosed Ideographic Supplement
    r'\U0001F900-\U0001F9FF'  # Supplemental Symbols and Pictographs
    r'\U0001FA70-\U0001FAFF'  # Symbols and Pictographs Extended-A
    r'\U00002190-\U000021FF'  # Arrows
    r'\U00002B00-\U00002BFF'  # Miscellaneous Symbols and Arrows
    r'\U0000FE00-\U0000FE0F'  # Variation Selectors
    r']+'
)

# Space-delimited numbers that are stripped together with the emoji.
# NOTE(review): dropping these numbers is carried over unchanged in intent;
# confirm that losing e.g. the "5" in "tiket 5 ribu" is really wanted.
_STANDALONE_NUMBER_PATTERN = re.compile(r' (?:10|13|24|32|[1-9]) ')

# Zero-width / soft-hyphen characters, mapped to None for str.translate.
_INVISIBLE_CHARS = dict.fromkeys(map(ord, '\u200b\u200c\u200d\u2060\ufeff\u00ad'))


def remove_emojis(text):
    """Strip emoji, listed standalone numbers and invisible characters.

    Whitespace runs are collapsed to a single space and the result is trimmed.
    Ordinary spaces between words are always preserved.

    Args:
        text: Any value; it is converted with ``str()`` unless it is missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is NaN/None or empty.
    """
    if pd.isna(text) or text == '':
        return ''

    text = str(text)

    text = _EMOJI_PATTERN.sub('', text)
    # Substitute one space (not nothing) so the neighbouring words stay apart.
    text = _STANDALONE_NUMBER_PATTERN.sub(' ', text)
    text = text.translate(_INVISIBLE_CHARS)

    return re.sub(r'\s+', ' ', text).strip()

# Queue-time ("Waktu antrean") boilerplate, applied in this order: the most
# specific phrases first, then generic catch-alls, then leftover fragments.
_QUEUE_TIME_PATTERNS = [re.compile(pattern, re.IGNORECASE) for pattern in (
    # Specific combinations, longest first
    r'Waktu antrean\s+Maks\s+10\s+mnt\s+Sebaiknya buat reservasi\s+Tidak yakin',
    r'Waktu antrean\s+30-60\s+mnt\s+Sebaiknya buat reservasi\s+Tidak yakin',
    r'Waktu antrean\s+Tanpa mengantre\s+Sebaiknya buat reservasi',
    r'Waktu antrean\s+Maks\s+10\s+mnt\s+Sebaiknya buat reservasi',
    r'Waktu antrean\s+Maks\s+10\s+mnt',
    r'Waktu antrean\s+Tanpa mengantre',
    r'Waktu antrean\s+1\s+jam\+',
    r'Waktu antrean\s+10-30\s+mnt',
    r'Waktu antrean\s+30-60\s+mnt',

    # Generic: everything from the label up to the next full stop / end of text
    r'Waktu antrean\s+[^\.]*?(?=\.|$)',
    r'Queue time\s+[^\.]*?(?=\.|$)',

    # Individual fragments that may be left over
    r'\bWaktu antrean\b\s*',
    r'\bTanpa mengantre\b\s*',
    r'\bSebaiknya buat reservasi\b\s*',
    r'\bTidak yakin\b\s*',
    r'\bQueue time\b\s*',
    r'\bNo queue\b\s*',
    r'\bShould make reservation\b\s*',
    r'\bNot sure\b\s*',
    r'\bWait time\b\s*',
    r'\bWaiting time\b\s*',

    # Durations. The word boundary sits before the optional "+": placed after
    # it, the "+" could never be consumed and "2 jam+" left a dangling "+".
    r'\b\d+\s+jam\b\+?\s*',
    r'\b\d+-\d+\s+mnt\b\s*',
    r'\bMaks\s+\d+\s+mnt\b\s*',
    r'\bMax\s+\d+\s+min\b\s*',
)]

# Google Translate markers
_TRANSLATE_PATTERNS = [re.compile(pattern, re.IGNORECASE) for pattern in (
    r'Diterjemahkan oleh Google\s*・?\s*',
    r'Lihat versi asli\s*\([^)]*\)\s*',
    r'Translated by Google\s*・?\s*',
    r'See original\s*\([^)]*\)\s*',
    r'Terjemahan Google\s*',
    r'Auto-translated\s*',
    r'・\s*',
)]

_VISIT_TIME_LABEL = re.compile(
    r'\b(akhir pekan|hari biasa|hari libur nasional)\b', re.IGNORECASE
)

# Words that, shortly before a visit-time label, show the label is part of a
# sentence. "di" must be a whole word: as a bare substring it also fires inside
# words such as "jadi" or "kondisi" and would switch the check off almost
# everywhere. The other indicators keep plain substring matching. The pattern
# is applied to already lower-cased text.
_VISIT_TIME_CONTEXT = re.compile(
    r'\bdi\b|pada|saat|ketika|waktu|selama|berkunjung|'
    r'datang|pergi|kunjungan|ramai|sepi|penuh|padat'
)


def _strip_visit_time_labels(text):
    """Remove visit-time labels that are not used as part of a sentence."""
    # Walk the matches back to front so earlier offsets stay valid after a cut.
    for match in reversed(list(_VISIT_TIME_LABEL.finditer(text))):
        start_pos, end_pos = match.span()
        before_text = text[max(0, start_pos - 30):start_pos].lower()

        if _VISIT_TIME_CONTEXT.search(before_text):
            continue

        is_at_end = end_pos >= len(text) - 2
        after_punctuation = start_pos > 0 and text[start_pos - 1] in '.,;:!?'
        if is_at_end or after_punctuation:
            text = text[:start_pos] + text[end_pos:]

    return text


def clean_review_text(text):
    """Remove scraping noise from one review text.

    In order: emoji (via ``remove_emojis``), queue-time boilerplate, Google
    Translate markers, visit-time labels without sentence context, and finally
    whitespace / leading and trailing punctuation.

    Args:
        text: Raw review text; any value is accepted and converted with ``str()``.

    Returns:
        The cleaned text, or ``''`` when the input is missing, empty, or has no
        word characters left after cleaning.
    """
    if pd.isna(text) or text == '':
        return ''

    text = remove_emojis(str(text))

    for pattern in _QUEUE_TIME_PATTERNS:
        text = pattern.sub('', text)

    for pattern in _TRANSLATE_PATTERNS:
        text = pattern.sub('', text)

    text = _strip_visit_time_labels(text)

    # Collapsing \s+ already removes newlines, so no separate newline pass.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[,\.\s]+$', '.', text)  # trailing commas/dots/spaces -> one full stop
    text = re.sub(r'^[,\.\s]+', '', text)
    text = text.strip()

    # Nothing but punctuation/symbols left: treat as empty.
    if re.match(r'^[^\w]*$', text):
        return ''

    return text

def clean_reviewer_name(name):
    """Normalize a reviewer name.

    Trims the name, removes emoji (via ``remove_emojis``) and Google Translate
    markers, and rejects results that are too short to be a name.

    Args:
        name: Raw reviewer name; any value is accepted.

    Returns:
        The cleaned name, or ``''`` when the input is missing/empty or the
        result has fewer than two characters or no word characters.
    """
    if pd.isna(name) or name == '':
        return ''

    cleaned = remove_emojis(str(name).strip())

    translate_markers = (
        r'Diterjemahkan oleh Google\s*',
        r'Translated by Google\s*',
        r'・\s*',
    )
    for marker in translate_markers:
        cleaned = re.sub(marker, '', cleaned, flags=re.IGNORECASE)
    cleaned = cleaned.strip()

    too_short = len(cleaned) < 2
    only_symbols = re.match(r'^[^\w]*$', cleaned) is not None
    return '' if (too_short or only_symbols) else cleaned

def detect_outliers(df, column, method='iqr'):
    """Flag outliers in a numeric column using the IQR rule or z-scores.

    Args:
        df: DataFrame to inspect.
        column: Name of the column to test.
        method: ``'iqr'`` flags values more than 1.5 * IQR outside the
            quartiles; ``'zscore'`` flags values more than 3 standard
            deviations from the mean.

    Returns:
        A boolean Series indexed like ``df`` (so it can be used directly as a
        row mask), True where the value is an outlier. All False when the
        column is missing or not numeric, when ``method`` is unknown, or when
        the z-score is undefined because the standard deviation is zero/NaN.
    """
    # Built on df.index: a fresh RangeIndex would not line up with frames whose
    # rows were filtered earlier.
    no_outliers = pd.Series(False, index=df.index)

    if column not in df.columns:
        return no_outliers

    values = df[column]
    # Accept every numeric dtype (int32, float32, nullable Int64, ...), not just
    # int64/float64. Booleans are excluded because quantiles make no sense there.
    if not pd.api.types.is_numeric_dtype(values) or pd.api.types.is_bool_dtype(values):
        return no_outliers

    if method == 'iqr':
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        return (values < q1 - 1.5 * spread) | (values > q3 + 1.5 * spread)

    if method == 'zscore':
        std = values.std()
        if not std > 0:  # zero or NaN spread: no value can exceed 3 sigma
            return no_outliers
        return ((values - values.mean()) / std).abs() > 3

    return no_outliers

# Cleaning pipeline for the reviews of a single destination
def clean_dataset(df, dataset_name, dataset_key):
    """Clean one destination's review table and record cleaning statistics.

    Steps: drop duplicates; drop outliers (reviews longer than 5000 characters,
    ratings that are non-numeric or outside 1-5); clean review text and reviewer
    names; reduce ``visit_time`` to four categories; cast ratings to int; fill
    missing text; add a ``wisata`` column. Progress is printed and counters are
    written to the module-level ``cleaning_summary[dataset_key]``.

    Args:
        df: Raw reviews. The columns ``reviewer_name``, ``review_text``,
            ``rating`` and ``visit_time`` are each processed only if present.
        dataset_name: Display name stored in the new ``wisata`` column.
        dataset_key: Key of this dataset's existing ``cleaning_summary`` entry.

    Returns:
        A new, cleaned DataFrame; ``df`` itself is left untouched.

    Raises:
        KeyError: If ``cleaning_summary`` has no entry for ``dataset_key``.
    """
    print(f"\n{'='*50}")
    print(f"Cleaning dataset: {dataset_name}")
    print(f"{'='*50}")

    print(f"Jumlah data awal: {len(df)}")
    print(f"Kolom: {list(df.columns)}")

    summary = cleaning_summary[dataset_key]
    summary['columns'] = list(df.columns)
    summary['cleaning_start_rows'] = len(df)

    # 1. Remove duplicates
    print("✓ Menghapus duplikat...")

    duplicate_columns = [
        col for col in ('reviewer_name', 'review_text', 'rating')
        if col in df.columns
    ]
    initial_rows = len(df)

    # Rows equal on the key columns are duplicates; that already covers fully
    # identical rows, so no second whole-row pass is needed. With no key column
    # available, whole rows are compared. The copy makes the later column
    # assignments act on an independent frame.
    if duplicate_columns:
        df_clean = df.drop_duplicates(subset=duplicate_columns, keep='first').copy()
    else:
        df_clean = df.drop_duplicates().copy()

    duplicates_removed = initial_rows - len(df_clean)
    if duplicates_removed > 0:
        print(f"   Duplikat dihapus: {duplicates_removed}")

    summary['duplicates_in_cleaning'] = duplicates_removed

    # 2. Detect and remove outliers
    print("✓ Mendeteksi outlier...")
    outliers_removed = 0

    # Extremely long reviews (likely spam or a scraping error)
    if 'review_text' in df_clean.columns:
        very_long_reviews = df_clean['review_text'].astype(str).str.len() > 5000
        very_long_count = int(very_long_reviews.sum())

        if very_long_count > 0:
            df_clean = df_clean[~very_long_reviews].copy()
            outliers_removed += very_long_count
            print(f"   Review sangat panjang (>5000 char) dihapus: {very_long_count}")

    # Ratings outside 1-5
    if 'rating' in df_clean.columns:
        # Coerce before the range check: comparing string ratings with numbers
        # raises TypeError. Unparseable values become NaN and fail between().
        df_clean['rating'] = pd.to_numeric(df_clean['rating'], errors='coerce')
        invalid_ratings = ~df_clean['rating'].between(1, 5)
        invalid_count = int(invalid_ratings.sum())

        if invalid_count > 0:
            df_clean = df_clean[~invalid_ratings].copy()
            outliers_removed += invalid_count
            print(f"   Rating invalid (bukan 1-5) dihapus: {invalid_count}")

    summary['outliers_removed'] = outliers_removed

    # 3. Clean the review text
    print("✓ Membersihkan teks review dari semua noise...")

    if 'review_text' in df_clean.columns:
        # Count the noise on the raw text so the report reflects what was found.
        before_cleaning = df_clean['review_text'].astype(str)

        google_translate_count = before_cleaning.str.contains('Diterjemahkan oleh Google|Translated by Google', case=False, na=False).sum()

        # Summed per pattern: a review matching several patterns counts once
        # for each of them.
        waktu_antrean_patterns = [
            r'Waktu antrean',
            r'Queue time',
            r'Tanpa mengantre',
            r'Sebaiknya buat reservasi',
            r'Maks \d+ mnt',
            r'\d+-\d+ mnt',
            r'\d+ jam\+?'
        ]
        waktu_antrean_count = sum(
            before_cleaning.str.contains(pattern, case=False, na=False).sum()
            for pattern in waktu_antrean_patterns
        )

        emoji_count = before_cleaning.str.contains(r'[\U0001F600-\U0001F64F]|[\U0001F300-\U0001F5FF]|🅰️|🅱️|⭐|✨|✅|❌|👍|👎', case=False, na=False).sum()

        df_clean['review_text'] = df_clean['review_text'].apply(clean_review_text)

        print(f"   Google Translate markers dibersihkan: {google_translate_count}")
        print(f"   Waktu antrean patterns dibersihkan: {waktu_antrean_count}")
        print(f"   Emoji dibersihkan: {emoji_count}")

        summary['google_translate_cleaned'] = google_translate_count
        summary['waktu_antrean_cleaned'] = waktu_antrean_count
        summary['emojis_cleaned'] = emoji_count

    # 4. Clean reviewer names
    if 'reviewer_name' in df_clean.columns:
        print("✓ Membersihkan nama reviewer...")
        df_clean['reviewer_name'] = df_clean['reviewer_name'].apply(clean_reviewer_name)

    # 5. Reduce visit_time to four categories
    if 'visit_time' in df_clean.columns:
        print("✓ Standardisasi visit_time...")

        visit_time_categories = {
            'hari biasa': 'Hari biasa',
            'weekday': 'Hari biasa',
            'weekdays': 'Hari biasa',
            'akhir pekan': 'Akhir pekan',
            'weekend': 'Akhir pekan',
            'akhir p': 'Akhir pekan',
            'weekends': 'Akhir pekan',
            'hari libur nasional': 'Hari libur nasional',
            'libur nasional': 'Hari libur nasional',
            'public holiday': 'Hari libur nasional',
            'national holiday': 'Hari libur nasional',
        }

        def clean_visit_time(visit_time):
            """Map a raw visit_time value to one of the four categories."""
            if pd.isna(visit_time) or visit_time == '':
                return 'Tidak diketahui'

            visit_time_str = str(visit_time).strip().lower()
            visit_time_str = remove_emojis(visit_time_str)
            visit_time_str = re.sub(r'diterjemahkan oleh google\s*・?\s*', '', visit_time_str)
            visit_time_str = visit_time_str.strip()

            # Anything unrecognized falls into the "unknown" category.
            return visit_time_categories.get(visit_time_str, 'Tidak diketahui')

        df_clean['visit_time'] = df_clean['visit_time'].apply(clean_visit_time)

        visit_time_dist = df_clean['visit_time'].value_counts()
        print(f"   Distribusi visit_time:")
        for visit_type, count in visit_time_dist.items():
            percentage = (count / len(df_clean)) * 100
            print(f"     '{visit_type}': {count} ({percentage:.1f}%)")

    # 6. Standardize ratings to integers in 1-5
    if 'rating' in df_clean.columns:
        # Ratings are already numeric and in range after step 2; fillna(3) is
        # kept as a defensive default in case a NaN is still present.
        df_clean['rating'] = df_clean['rating'].fillna(3).astype(int)
        df_clean['rating'] = df_clean['rating'].clip(1, 5)
        print(f"✓ Rating distandardisasi (1-5)")

    # 7. Fill missing values
    text_columns = ['reviewer_name', 'review_text']
    for col in text_columns:
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].fillna('')

    if 'visit_time' in df_clean.columns:
        df_clean['visit_time'] = df_clean['visit_time'].fillna('Tidak diketahui')

    # 8. Tag every row with the destination name
    df_clean['wisata'] = dataset_name

    # Update the summary. An empty input would otherwise divide by zero.
    start_rows = summary['cleaning_start_rows']
    summary['final_rows'] = len(df_clean)
    summary['retention_rate'] = (len(df_clean) / start_rows) * 100 if start_rows else 0.0

    print(f"\nJumlah data setelah cleaning: {len(df_clean)}")
    print(f"Retention rate: {summary['retention_rate']:.1f}%")

    return df_clean

# Destination key -> display name. The display name is what gets passed to
# clean_dataset() as dataset_name and therefore ends up in the 'wisata' column.
wisata_names = {
    'alun_alun': 'Alun Alun Kota Wisata Batu',
    'batu_economis_park': 'Batu Economis Park',
    'batu_love_garden': 'Batu Love Garden (Baloga)',
    'batu_night_spectacular': 'Batu Night Spectacular',
    'batu_rafting': 'Batu Rafting',
    'coban_putri': 'Coban Putri',
    'coban_rais': 'Air Terjun Coban Rais',
    'coban_talun': 'Coban Talun',
    'desa_wisata_punten': 'Desa Wisata Punten',
    'desa_wisata_tulungrejo': 'Desa Wisata Tulungrejo',
    'eco_active_park': 'Eco Active Park',
    'gussari_goa_pinus_batu': 'Gussari Goa Pinus Batu',
    'gunung_arjuno': 'Gunung Arjuno',
    'gunung_panderman': 'Gunung Panderman',
    'jatim_park_1': 'Jatim Park 1',
    'jatim_park_2': 'Jatim Park 2',
    'jatim_park_3': 'Jatim Park 3',
    'lumbung_stroberi': 'Lumbung Stroberi',
    'milenial_glow_garden': 'Milenial Glow Garden',
    'museum_angkut': 'Museum Angkut',
    'paralayang_gunung_banyak': 'Paralayang Gunung Banyak',
    'pemandian_air_panas_cangar': 'Pemandian Air Panas Cangar',
    'songgoriti_hot_springs': 'Songgoriti Hot Springs',
    'taman_dolan': 'Taman Dolan',
    'taman_pinus_campervan': 'Taman Pinus Campervan',
    'taman_selecta': 'Taman Selecta',
    'tirta_nirwana_hotspring': 'Tirta Nirwana Hotspring',
    'wisata_bunga_sidomulyo': 'Wisata Bunga Sidomulyo',
    'wisata_desa_agro_bumiaji': 'Wisata Desa Agro Bumiaji',
    'wisata_petik_apel_mandiri': 'Wisata Petik Apel Mandiri'
}

print("\n" + "=" * 60, "PROSES CLEANING DATASETS", "=" * 60, sep="\n")

# Run the cleaning pipeline on every loaded dataset, keyed like `datasets`.
cleaned_datasets = {}
for key, df in datasets.items():
    cleaned_datasets[key] = cleaned_df = clean_dataset(df, wisata_names[key], key)

# Gabungkan semua dataset
if cleaned_datasets:
    all_reviews = pd.concat(cleaned_datasets.values(), ignore_index=True)
    
    # Final duplicate removal pada dataset gabungan
    print(f"\n✓ Dataset gabungan sebelum final dedup: {len(all_reviews)}")
    
    # Hapus duplikat final berdasarkan kombinasi kolom penting
    duplicate_columns = ['reviewer_name', 'review_text', 'rating', 'wisata']
    available_columns = [col for col in duplicate_columns if col in all_reviews.columns]
    
    if available_columns:
        before_final_dedup = len(all_reviews)
        all_reviews = all_reviews.drop_duplicates(subset=available_columns, keep='first')
        final_duplicates_removed = before_final_dedup - len(all_reviews)
        if final_duplicates_removed > 0:
            print(f"✓ Final duplicates removed: {final_duplicates_removed}")
    
    print("\n" + "="*60)
    print("RINGKASAN DATASET GABUNGAN")
    print("="*60)
    print(f"Total review: {len(all_reviews):,}")
    print(f"Total wisata: {len(all_reviews['wisata'].unique())}")
    print(f"Source folders: {', '.join(available_folders)}")
    
    if len(all_reviews['wisata'].unique()) == 30:
        print("✓ KONFIRMASI: Total wisata = 30 wisata")
    else:
        print(f"⚠ PERINGATAN: Total wisata = {len(all_reviews['wisata'].unique())} (target: 30)")
        print("   Daftar wisata yang ada:")
        for i, wisata in enumerate(sorted(all_reviews['wisata'].unique()), 1):
            print(f"     {i:2d}. {wisata}")
    
    # Verifikasi cleaning "Waktu antrean"
    remaining_waktu_antrean = all_reviews['review_text'].str.contains('Waktu antrean|Queue time', case=False, na=False).sum()
    print(f"✓ 'Waktu antrean' tersisa: {remaining_waktu_antrean}")
    
    remaining_emojis = all_reviews['review_text'].str.contains(r'[\U0001F600-\U0001F64F]|🅰️|🅱️|⭐|👍|👎', case=False, na=False).sum()
    print(f"✓ Emoji tersisa: {remaining_emojis}")
    
    print(f"\nDistribusi rating:")
    rating_dist = all_reviews['rating'].value_counts().sort_index()
    for rating, count in rating_dist.items():
        percentage = (count / len(all_reviews)) * 100
        print(f"  Rating {rating}: {count:,} ({percentage:.1f}%)")
    
    print(f"\nRata-rata rating: {all_reviews['rating'].mean():.2f}")
    
    if 'visit_time' in all_reviews.columns:
        print(f"\nDistribusi waktu kunjungan:")
        visit_dist = all_reviews['visit_time'].value_counts()
        for visit_type, count in visit_dist.items():
            percentage = (count / len(all_reviews)) * 100
            print(f"  '{visit_type}': {count:,} ({percentage:.1f}%)")
    
    # Simpan dataset
    print("\n" + "="*60)
    print("MENYIMPAN DATASET")
    print("="*60)
    
    combined_filename = os.path.join(output_folder, 'combined_batu_tourism_reviews_cleaned.csv')
    all_reviews.to_csv(combined_filename, index=False, encoding='utf-8')
    print(f"✓ Dataset gabungan disimpan: {combined_filename}")
    
    # Buat laporan komprehensif
    def create_comprehensive_report(df, cleaning_summary, source_folders):
        report = []
        report.append("="*80)
        report.append("LAPORAN CLEANING DATA - BATU TOURISM REVIEWS")
        report.append("="*80)
        report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report.append(f"Source folders: {', '.join(source_folders)}")
        report.append(f"Total reviews: {len(df):,}")
        report.append(f"Total wisata: {len(df['wisata'].unique())}")
        
        # Summary statistik cleaning
        total_original = sum(s.get('original_rows', 0) for s in cleaning_summary.values() if s.get('status', '').startswith('SUCCESS'))
        total_final = len(df)
        total_duplicates = sum(s.get('duplicates_in_cleaning', 0) for s in cleaning_summary.values())
        total_outliers = sum(s.get('outliers_removed', 0) for s in cleaning_summary.values())
        total_waktu_antrean = sum(s.get('waktu_antrean_cleaned', 0) for s in cleaning_summary.values())
        total_emojis = sum(s.get('emojis_cleaned', 0) for s in cleaning_summary.values())
        
        report.append(f"\nRINGKASAN CLEANING:")
        report.append(f"✓ Total rows awal: {total_original:,}")
        report.append(f"✓ Total rows akhir: {total_final:,}")
        report.append(f"✓ Retention rate: {(total_final/total_original*100):.1f}%")
        report.append(f"✓ Duplikat dihapus: {total_duplicates:,}")
        report.append(f"✓ Outlier dihapus: {total_outliers:,}")
        report.append(f"✓ 'Waktu antrean' patterns dibersihkan: {total_waktu_antrean:,}")
        report.append(f"✓ Emoji dibersihkan: {total_emojis:,}")
        
        # Verifikasi hasil cleaning
        remaining_waktu_antrean = df['review_text'].str.contains('Waktu antrean|Queue time', case=False, na=False).sum()
        remaining_emojis = df['review_text'].str.contains(r'[\U0001F600-\U0001F64F]|🅰️|🅱️|⭐|👍|👎', case=False, na=False).sum()
        
        report.append(f"\nVERIFIKASI HASIL CLEANING:")
        report.append(f"✓ 'Waktu antrean' tersisa: {remaining_waktu_antrean:,}")
        report.append(f"✓ Emoji tersisa: {remaining_emojis:,}")
        
        # Detail file sources per wisata
        report.append(f"\nDETAIL SUMBER FILE PER WISATA:")
        report.append("-" * 80)
        
        for key, summary in cleaning_summary.items():
            if summary['status'].startswith('SUCCESS'):
                wisata_name = wisata_names.get(key, key)
                report.append(f"\n{wisata_name}:")
                report.append(f"  Status: {summary['status']}")
                report.append(f"  Files loaded: {summary['files_loaded']}")
                for file_info in summary['files']:
                    report.append(f"    - {file_info}")
                report.append(f"  Final rows: {summary.get('final_rows', 0):,}")
        
        # Distribusi data
        report.append(f"\nDISTRIBUSI DATA:")
        report.append(f"Rata-rata rating: {df['rating'].mean():.2f}")
        report.append(f"Rata-rata panjang review: {df['review_text'].str.len().mean():.0f} karakter")
        
        rating_dist = df['rating'].value_counts().sort_index()
        report.append(f"\nDistribusi Rating:")
        for rating, count in rating_dist.items():
            percentage = (count / len(df)) * 100
            report.append(f"  Rating {rating}: {count:,} ({percentage:.1f}%)")
        
        if 'visit_time' in df.columns:
            visit_dist = df['visit_time'].value_counts()
            report.append(f"\nDistribusi Waktu Kunjungan:")
            for visit_type, count in visit_dist.items():
                percentage = (count / len(df)) * 100
                report.append(f"  '{visit_type}': {count:,} ({percentage:.1f}%)")
        
        # Top wisata
        top_wisata = df['wisata'].value_counts().head(10)
        report.append(f"\nTOP 10 WISATA (berdasarkan jumlah review):")
        for i, (wisata, count) in enumerate(top_wisata.items(), 1):
            percentage = (count / len(df)) * 100
            avg_rating = df[df['wisata'] == wisata]['rating'].mean()
            report.append(f"  {i:2d}. {wisata}: {count:,} reviews ({percentage:.1f}%) - Rating: {avg_rating:.2f}")
        
        # Detail per wisata
        report.append(f"\nDETAIL STATISTIK PER WISATA:")
        report.append("-" * 80)
        
        wisata_stats = df.groupby('wisata').agg({
            'rating': ['count', 'mean'],
            'review_text': lambda x: x.str.len().mean()
        }).round(2)
        
        wisata_stats.columns = ['review_count', 'avg_rating', 'avg_length']
        wisata_stats = wisata_stats.sort_values('review_count', ascending=False)
        
        for i, (wisata, stats) in enumerate(wisata_stats.iterrows(), 1):
            report.append(f"{i:2d}. {wisata}")
            report.append(f"    Reviews: {int(stats['review_count']):,}")
            report.append(f"    Avg Rating: {stats['avg_rating']:.2f}")
            report.append(f"    Avg Review Length: {stats['avg_length']:.0f} chars")
        
        return '\n'.join(report)
    
    # Build the text report from the combined reviews, the per-dataset cleaning
    # summary and the list of source folders, then write it to the output
    # folder as UTF-8 text.
    comprehensive_report = create_comprehensive_report(
        all_reviews,
        cleaning_summary,
        available_folders,
    )

    report_filename = os.path.join(output_folder, 'comprehensive_cleaning_report.txt')
    with open(report_filename, mode='w', encoding='utf-8') as report_file:
        report_file.write(comprehensive_report)
    print(f"✓ Laporan komprehensif disimpan: {report_filename}")
    
    # Build the per-destination summary table: one row per distinct 'wisata'
    # value holding the review count, rating / review-length aggregates, a
    # 1-5 rating histogram and (when the column exists) a visit-time histogram.

    # Output column name -> 'visit_time' label it counts. Insertion order is
    # the column order written to the CSV.
    visit_time_columns = {
        'Hari_Biasa': 'Hari biasa',
        'Akhir_Pekan': 'Akhir pekan',
        'Hari_Libur_Nasional': 'Hari libur nasional',
        'Tidak_Diketahui': 'Tidak diketahui',
    }
    # Row filtering never changes the column set, so this is checked once.
    has_visit_time = 'visit_time' in all_reviews.columns

    summary_data = []
    for wisata in sorted(all_reviews['wisata'].unique()):
        wisata_data = all_reviews[all_reviews['wisata'] == wisata]
        ratings = wisata_data['rating']
        review_lengths = wisata_data['review_text'].str.len()

        summary_row = {
            'Wisata': wisata,
            'Total_Reviews': len(wisata_data),
            'Avg_Rating': round(ratings.mean(), 2),
            'Avg_Review_Length': round(review_lengths.mean(), 0),
            'Min_Rating': ratings.min(),
            'Max_Rating': ratings.max(),
        }

        # Rating histogram; ratings with no reviews are reported as 0.
        rating_counts = ratings.value_counts()
        summary_row.update(
            (f'Rating_{star}', rating_counts.get(star, 0)) for star in range(1, 6)
        )

        # Visit-time histogram; labels that never occur are reported as 0.
        if has_visit_time:
            visit_counts = wisata_data['visit_time'].value_counts()
            summary_row.update(
                (column, visit_counts.get(label, 0))
                for column, label in visit_time_columns.items()
            )

        summary_data.append(summary_row)

    # Largest destinations first.
    summary_stats = pd.DataFrame(summary_data).sort_values(
        'Total_Reviews', ascending=False
    )

    summary_filename = os.path.join(output_folder, 'wisata_summary_statistics.csv')
    summary_stats.to_csv(summary_filename, index=False, encoding='utf-8')
    print(f"✓ Summary statistik disimpan: {summary_filename}")
    
    # Closing banner: recap what the cleaning run did and where results went.
    banner_rule = "=" * 60
    print("\n" + banner_rule)
    print("CLEANING SELESAI")
    print(banner_rule)

    wisata_total = len(all_reviews['wisata'].unique())
    folder_list = ', '.join(available_folders)
    closing_lines = (
        f"✅ BERHASIL: {len(all_reviews):,} reviews dari {wisata_total} wisata",
        f"✅ SOURCE: Data dari {len(available_folders)} folder ({folder_list})",
        "✅ DUPLIKAT: Dihapus dengan metode yang lebih ketat",
        "✅ OUTLIER: Dihapus berdasarkan panjang review dan rating invalid",
        "✅ WAKTU ANTREAN: Semua variasi berhasil dihapus",
        "✅ EMOJI: Dibersihkan dari review dan nama",
        "✅ VISIT TIME: Distandardisasi ke 4 kategori",
        f"✅ FILES: Tersimpan di folder '{output_folder}'",
    )
    for closing_line in closing_lines:
        print(closing_line)
    
    # Show how many loaded source files came from each source folder.
    # Only datasets whose status starts with 'SUCCESS' are counted. The folder
    # name is read from the text between "from '" and the next single quote in
    # each entry of summary['files'].
    print(f"\n✓ BREAKDOWN SUMBER DATA:")
    files_from_folders = {}
    for summary in cleaning_summary.values():
        if not summary['status'].startswith('SUCCESS'):
            continue
        for file_info in summary['files']:
            # Guard on the exact marker that is parsed: an entry that merely
            # contains the word "from" (e.g. inside a file name) has no
            # second split element and would otherwise raise IndexError.
            parts = file_info.split("from '")
            if len(parts) < 2:
                continue
            folder_name = parts[1].split("'")[0]
            files_from_folders[folder_name] = files_from_folders.get(folder_name, 0) + 1

    for folder, count in files_from_folders.items():
        print(f"   {folder}: {count} files")
    
else:
    print("❌ Tidak ada dataset yang berhasil dibersihkan!")

✓ Folder sumber ditemukan: hasil scraping
✓ Folder sumber ditemukan: hasil scraping rating rendah
✓ Folder 'cleaning_data' sudah ada
✓ Membaca dataset dari folder: hasil scraping, hasil scraping rating rendah
✓ Hasil cleaning akan disimpan ke folder: cleaning_data

LOADING DATASETS DARI SEMUA FOLDER SUMBER
✓ air_terjun_coban_rais_ALL_reviews_20250802_102141.csv berhasil dimuat (3358 rows) dari 'hasil scraping'
✓ coban_rais_reviews_20250804_223751.csv berhasil dimuat (25 rows) dari 'hasil scraping'
✓ air_terjun_coban_rais_reviews_1to3stars_with_text_20250828_135744.csv berhasil dimuat (200 rows) dari 'hasil scraping rating rendah'
  → Digabung menjadi 3430 rows (duplikat dihapus: 153)
✓ alun_alun_kota_wisata_batu_reviews_20250725_105504.csv berhasil dimuat (759 rows) dari 'hasil scraping'
✓ alun_alun_batu_reviews_1to3stars_with_text_20250831_180701.csv berhasil dimuat (278 rows) dari 'hasil scraping rating rendah'
  → Digabung menjadi 1037 rows (duplikat dihapus: 0)
✓ batu_economis_park