In [9]:
import html
import io
import re
from collections import defaultdict

import ipywidgets as widgets
import pandas as pd
from IPython.display import display, HTML, clear_output

class BooleanRetrievalSystem:
    """In-memory Boolean retrieval over documents with ``judul``/``konten`` fields.

    Documents are numbered from 1 in the order they are indexed. Queries are
    evaluated strictly left to right (no operator precedence, no parentheses)
    and support ``AND``/``OR``/``NOT`` (or ``&``/``|``/``!``), double-quoted
    groups of words, and implicit ``AND`` between adjacent terms.
    """

    # Lower-cased words that act as Boolean operators inside a query.
    _OPERATORS = frozenset({'and', 'or', 'not'})
    # Word extractor shared by indexing and query processing.
    _WORD_RE = re.compile(r'\b\w+\b')
    # Markup wrapped around each highlighted match; ``{}`` receives escaped text.
    _MARK_TEMPLATE = (
        '<mark style="background-color: yellow; padding: 2px 4px; border-radius: 3px;">'
        '<b>{}</b></mark>'
    )

    def __init__(self):
        self.dataset = []
        self.inverted_index = defaultdict(set)
        self.universal_set = set()
        self.stopwords = {
            'yang', 'di', 'ke', 'dari', 'dan', 'atau', 'untuk', 'dengan', 'pada', 'dalam',
            'adalah', 'akan', 'telah', 'sudah', 'dapat', 'bisa', 'ini', 'itu', 'ada',
            'tidak', 'juga', 'lebih', 'saat', 'serta', 'antara', 'oleh', 'atas', 'bawah',
            'sebagai', 'karena', 'jika', 'maka', 'ketika', 'dimana', 'bagaimana', 'mengapa'
        }

    def tokenize(self, text):
        """Split *text* into lower-cased index terms.

        Words of two characters or fewer and stopwords are dropped. Falsy
        input yields an empty list; non-string input is converted with
        ``str`` first.
        """
        if not text:
            return []
        words = self._WORD_RE.findall(str(text).lower())
        return [word for word in words if len(word) > 2 and word not in self.stopwords]

    def build_inverted_index(self, docs):
        """Rebuild the inverted index and the universal document set from *docs*.

        Each entry of *docs* is a mapping; its ``judul`` and ``konten`` values
        are concatenated and tokenized. Document ids start at 1. A short
        summary and up to 20 sample terms are printed.
        """
        self.inverted_index = defaultdict(set)
        self.universal_set = set()

        for doc_id, doc in enumerate(docs, start=1):
            self.universal_set.add(doc_id)

            # Title and body are indexed together as one text.
            text = f"{doc.get('judul', '')} {doc.get('konten', '')}"
            for token in self.tokenize(text):
                self.inverted_index[token].add(doc_id)

        print(f"✅ Inverted index berhasil dibangun!")
        print(f"📊 Total dokumen: {len(docs)}")
        print(f"📝 Total terms: {len(self.inverted_index)}")

        # Show a few indexed terms, five per row, as a hint for query writing.
        print("\n🔍 Sample terms yang tersedia:")
        sample_terms = list(self.inverted_index)[:20]
        for i, term in enumerate(sample_terms):
            if i % 5 == 0:
                print()
            print(f"{term:12}", end=" ")
        print("\n")

    @staticmethod
    def _split_query(query):
        """Split a normalized query into tokens, keeping quoted groups together.

        A quote that is never closed is treated as running to the end of the
        query, so its words are not lost.
        """
        tokens = []
        phrase = None  # list of words while inside an open quote, else None
        for word in query.split():
            if phrase is not None:
                if word.endswith('"'):
                    phrase.append(word[:-1])
                    tokens.append(' '.join(part for part in phrase if part))
                    phrase = None
                else:
                    phrase.append(word)
            elif word.startswith('"'):
                if len(word) > 1 and word.endswith('"'):
                    # A single word wrapped in quotes.
                    tokens.append(word.strip('"'))
                else:
                    phrase = [word[1:]]
            else:
                tokens.append(word)

        if phrase is not None:
            # Unterminated quote: keep what was collected.
            tokens.append(' '.join(part for part in phrase if part))

        return [token for token in tokens if token]

    def _postings_for(self, words):
        """Return a fresh set of ids of documents containing every word in *words*."""
        if not words:
            return set()
        postings = set(self.inverted_index.get(words[0], ()))
        for word in words[1:]:
            postings &= self.inverted_index.get(word, set())
        return postings

    def boolean_search(self, query):
        """Evaluate a Boolean *query* and return the set of matching document ids.

        Operators are applied left to right. Two adjacent terms with no
        operator between them are combined with AND. A quoted group (or any
        token that tokenizes to several words) matches documents containing
        all of its words; word order and adjacency are not checked. If the
        query is malformed (an operator without an operand), the search falls
        back to the union of the postings of every non-operator word.
        Progress information is printed while the query is processed.
        """
        # Normalize case and map symbolic operators to their word form.
        query = query.lower().strip()
        query = re.sub(r'\s*\|\s*', ' or ', query)
        query = re.sub(r'\s*&\s*', ' and ', query)
        query = re.sub(r'\s*!\s*', ' not ', query)

        tokens = self._split_query(query)
        if not tokens:
            return set()

        print(f"🔍 Processing query: {query}")
        print(f"📝 Tokens: {tokens}")

        pos = 0

        def parse_term():
            """Consume optional NOTs plus one term; return its postings."""
            nonlocal pos
            negated = False

            # Each NOT flips the negation, so a doubled NOT cancels out.
            while pos < len(tokens) and tokens[pos] == 'not':
                negated = not negated
                pos += 1

            if pos >= len(tokens) or tokens[pos] in ('and', 'or'):
                raise ValueError("Operator tidak diikuti dengan term")

            word = tokens[pos]
            pos += 1

            postings = self._postings_for(self.tokenize(word))

            search_term = word if ' ' not in word else f'"{word}"'
            print(f"  📄 Term '{search_term}': {len(postings)} dokumen ditemukan")

            if negated:
                postings = self.universal_set - postings
                print(f"  ❌ NOT '{search_term}': {len(postings)} dokumen (negasi)")

            return postings

        try:
            result = parse_term()

            while pos < len(tokens):
                if tokens[pos] in ('and', 'or'):
                    operator = tokens[pos]
                    pos += 1
                else:
                    # No explicit operator between two terms: implicit AND.
                    operator = 'and'
                    print(f"ℹ️  Tidak ada operator sebelum '{tokens[pos]}', diperlakukan sebagai AND")

                right_operand = parse_term()
                old_count = len(result)

                if operator == 'and':
                    result &= right_operand
                    print(f"  🔗 AND operation: {old_count} ∩ {len(right_operand)} = {len(result)} dokumen")
                else:
                    result |= right_operand
                    print(f"  🔗 OR operation: {old_count} ∪ {len(right_operand)} = {len(result)} dokumen")

            return result

        except ValueError as e:
            print(f"❌ Error dalam parsing query: {str(e)}")
            # Best effort for malformed queries: OR together every plain word.
            print("🔄 Mencoba pencarian fallback...")

            all_words = []
            for token in tokens:
                if token not in self._OPERATORS:
                    all_words.extend(self.tokenize(token))

            if all_words:
                fallback_result = set()
                for word in all_words:
                    fallback_result |= self.inverted_index.get(word, set())

                print(f"📄 Fallback result: {len(fallback_result)} dokumen ditemukan")
                return fallback_result

            return set()

    def highlight_text(self, text, query, max_length=200):
        """Return an HTML-safe preview of *text* with query keywords highlighted.

        The raw text is first cut to *max_length* characters (``...`` is
        appended when it was longer), then HTML-escaped, and every
        case-insensitive occurrence of a query keyword is wrapped in a
        ``<mark>`` element while keeping the casing found in the text.
        Truncating before adding markup guarantees that no tag is cut in half.
        ``None`` is treated as empty text.
        """
        text = '' if text is None else str(text)
        snippet = text[:max_length]
        suffix = "..." if len(text) > max_length else ""

        keywords = []
        if query:
            # Longest keywords first so that overlapping keywords prefer the longer match.
            keywords = sorted(
                {kw for kw in self.tokenize(query) if kw not in self._OPERATORS},
                key=lambda kw: (-len(kw), kw),
            )

        if not snippet or not keywords:
            return html.escape(snippet) + suffix

        # One alternation pattern, applied to the raw text in a single pass,
        # so inserted markup is never re-scanned for keywords.
        pattern = re.compile('|'.join(re.escape(kw) for kw in keywords), re.IGNORECASE)

        parts = []
        last_end = 0
        for match in pattern.finditer(snippet):
            parts.append(html.escape(snippet[last_end:match.start()]))
            parts.append(self._MARK_TEMPLATE.format(html.escape(match.group(0))))
            last_end = match.end()
        parts.append(html.escape(snippet[last_end:]))

        return ''.join(parts) + suffix

    def search_and_display(self, query):
        """Run *query* and render up to 10 matching documents in the notebook.

        Prints a summary, then for each result shows the highlighted title,
        a highlighted content preview, the category (if present and truthy)
        and the content length. Prints search tips when nothing matches.
        """
        if not query.strip():
            print("❌ Masukkan query pencarian!")
            return

        print("="*80)
        result_ids = self.boolean_search(query)
        results = []

        for doc_id in sorted(result_ids):
            if 1 <= doc_id <= len(self.dataset):
                doc = self.dataset[doc_id - 1].copy()
                doc['doc_id'] = doc_id
                results.append(doc)

        print(f"\n🎯 HASIL PENCARIAN")
        print(f"🔍 Query: '{query}'")
        print(f"📊 Ditemukan {len(results)} dokumen")
        print("="*80)

        if not results:
            print("\n❌ Tidak ada dokumen yang cocok dengan query Anda.")
            print("\n💡 Tips:")
            print("   - Periksa ejaan kata kunci")
            print("   - Coba gunakan kata yang lebih umum")
            print("   - Gunakan operator OR untuk mencari alternatif")
            print("   - Periksa terms yang tersedia di atas")
            return

        # Only the first 10 results are rendered.
        for idx, doc in enumerate(results[:10], 1):
            print(f"\n📄 DOKUMEN #{doc['doc_id']} - Hasil ke-{idx}")
            print("-"*60)

            # Title with highlighting.
            judul = doc.get('judul', 'Tanpa Judul')
            print(f"📰 JUDUL:")
            display(HTML(f"<h3 style='color: #2E86AB; margin: 10px 0;'>{self.highlight_text(judul, query, 500)}</h3>"))

            # Content preview with highlighting.
            konten = doc.get('konten', '')
            konten = '' if konten is None else str(konten)
            print(f"\n📝 KONTEN (Preview):")
            display(HTML(f"<div style='padding: 15px; background-color: #f8f9fa; border-left: 4px solid #2E86AB; margin: 10px 0; border-radius: 5px;'>{self.highlight_text(konten, query, 300)}</div>"))

            # Category badge, escaped because it comes from the uploaded data.
            if 'kategori' in doc and doc['kategori']:
                display(HTML(f"<span style='background-color: #e3f2fd; color: #1976d2; padding: 4px 8px; border-radius: 15px; font-size: 12px;'>🏷️ {html.escape(str(doc['kategori']))}</span>"))

            print(f"\n📊 Total karakter: {len(konten)}")
            print()

        if len(results) > 10:
            print(f"... dan {len(results) - 10} dokumen lainnya")

def _find_column(columns, keywords, exclude=()):
    """Return the first column whose lower-cased name contains a keyword, else None.

    Columns listed in *exclude* are skipped so that one column is never
    chosen for two different roles.
    """
    for col in columns:
        if col in exclude:
            continue
        name = str(col).lower()
        if any(keyword in name for keyword in keywords):
            return col
    return None


def demo_boolean_retrieval(min_words=20):
    """Build and display the notebook UI for the Boolean retrieval demo.

    Shows a CSV upload widget, a query box and a search button. Uploading a
    CSV detects the title/content/(optional) category columns by name, keeps
    rows whose title plus content have at least *min_words* whitespace-separated
    words, and indexes them. Clicking the button runs the query and renders
    the results into the output area.

    Args:
        min_words: Minimum number of words a document needs to be indexed.

    Returns:
        The ``BooleanRetrievalSystem`` instance backing the UI.
    """

    print("🔍 SISTEM BOOLEAN RETRIEVAL - BERITA OLAHRAGA")
    print("=" * 60)

    system = BooleanRetrievalSystem()

    # Upload widget, restricted to a single CSV file.
    file_upload = widgets.FileUpload(
        accept='.csv',
        multiple=False,
        description='Upload CSV'
    )

    # Query input.
    query_input = widgets.Text(
        placeholder='Contoh: Timnas Indonesia OR PSSI, sepak bola AND indonesia',
        description='Query:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='600px')
    )

    search_button = widgets.Button(
        description='🔍 Cari',
        button_style='primary'
    )

    output = widgets.Output()

    def on_upload_change(change):
        """Load the uploaded CSV, normalize its columns and rebuild the index."""
        with output:
            clear_output()
            try:
                if not file_upload.value:
                    return

                print("📁 Processing uploaded file...")

                # The widget value is handled both as a dict of files and as a sequence of files.
                uploaded_files = file_upload.value
                if isinstance(uploaded_files, dict):
                    uploaded_file = list(uploaded_files.values())[0]
                else:
                    uploaded_file = uploaded_files[0]

                df = pd.read_csv(io.BytesIO(uploaded_file['content']))

                print(f"📊 Kolom yang tersedia: {list(df.columns)}")

                # Flexible column detection; later lookups skip columns that
                # were already claimed so title and content never coincide.
                judul_col = _find_column(df.columns, ['judul', 'title', 'headline'])
                konten_col = _find_column(
                    df.columns,
                    ['text', 'content', 'konten', 'isi', 'berita'],
                    exclude=(judul_col,),
                )
                kategori_col = _find_column(
                    df.columns,
                    ['kategori', 'category', 'topic'],
                    exclude=(judul_col, konten_col),
                )

                if judul_col is None or konten_col is None:
                    print("❌ Error: Tidak dapat menemukan kolom judul dan konten")
                    print("💡 Pastikan CSV memiliki kolom yang mengandung kata: judul/title dan text/content/konten")
                    print(f"📋 Kolom yang tersedia: {list(df.columns)}")
                    return

                print(f"✅ Kolom yang akan digunakan:")
                print(f"   📰 Judul: '{judul_col}'")
                print(f"   📝 Konten: '{konten_col}'")
                if kategori_col is not None:
                    print(f"   🏷️  Kategori: '{kategori_col}'")

                # New frame with consistent column names; text columns are
                # forced to str so later length/highlight operations are safe.
                df_renamed = pd.DataFrame()
                df_renamed['judul'] = df[judul_col].fillna('').astype(str)
                df_renamed['konten'] = df[konten_col].fillna('').astype(str)
                if kategori_col is not None:
                    df_renamed['kategori'] = df[kategori_col].fillna('')

                # Whitespace-separated word count of title + content per row.
                combined = df_renamed['judul'] + ' ' + df_renamed['konten']
                df_renamed['word_count'] = combined.str.split().str.len()
                df_filtered = df_renamed[df_renamed['word_count'] >= min_words].copy()

                print(f"📊 Statistik dataset:")
                print(f"   📄 Total dokumen: {len(df)}")
                print(f"   ✅ Dokumen valid (≥{min_words} kata): {len(df_filtered)}")
                print(f"   📈 Rata-rata kata per dokumen: {df_filtered['word_count'].mean():.1f}")

                if len(df_filtered) == 0:
                    print(f"❌ Error: Tidak ada dokumen yang memenuhi kriteria minimal {min_words} kata")
                    return

                system.dataset = df_filtered.to_dict('records')
                system.build_inverted_index(system.dataset)

                print("🎉 Dataset siap untuk pencarian!")

            except Exception as e:
                # UI boundary: report the failure in the output area instead of losing it.
                print(f"❌ Error: {str(e)}")
                import traceback
                print(f"Detail error: {traceback.format_exc()}")

    def on_search_click(b):
        """Run the current query against the loaded dataset."""
        with output:
            if not system.dataset:
                print("❌ Upload dataset terlebih dahulu!")
                return

            query = query_input.value.strip()
            if query:
                clear_output()
                system.search_and_display(query)

    # Event handlers.
    file_upload.observe(on_upload_change, names='value')
    search_button.on_click(on_search_click)

    # Usage instructions followed by the widgets themselves.
    print("1. Upload file CSV berita olahraga")
    print("2. Masukkan query dengan operator Boolean (AND, OR, NOT)")
    print("3. Klik tombol Cari")


    display(file_upload)
    display(widgets.HBox([query_input, search_button]))
    display(output)

    return system

# Launch the demo UI when this module is run directly; the returned system is kept in `system`
if __name__ == "__main__":
    system = demo_boolean_retrieval()

🔍 SISTEM BOOLEAN RETRIEVAL - BERITA OLAHRAGA
1. Upload file CSV berita olahraga
2. Masukkan query dengan operator Boolean (AND, OR, NOT)
3. Klik tombol Cari


FileUpload(value=(), accept='.csv', description='Upload CSV')

HBox(children=(Text(value='', description='Query:', layout=Layout(width='600px'), placeholder='Contoh: Timnas …

Output()