In [None]:
# Notebook helpers; neither name is referenced in the code visible in this section.
from IPython import get_ipython
from IPython.display import display

# Install the third-party packages; stdout and stderr of the installer are discarded.
!pip install pandas requests beautifulsoup4 pdfminer.six lxml > /dev/null 2>&1

# Mount Google Drive at /content/drive; the data and model paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# ----------CASE RETRIEVAL SYSTEM - TAHAP 3 (CORRECTED FOR DRUG CRIME CASES)-------------

import pandas as pd
import numpy as np
import json
import os
from datetime import date
import re
from collections import Counter
import pickle

# Machine Learning imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
import joblib

# For BERT/IndoBERT (using transformers)
try:
    from transformers import AutoTokenizer, AutoModel
    import torch
    BERT_AVAILABLE = True
    print("Transformers library available - IndoBERT will be used")
except ImportError:
    BERT_AVAILABLE = False
    print("Transformers library not available - using TF-IDF only")

# Load the processed drug crime cases written by the case-representation step.
processed_folder = "/content/drive/MyDrive/data/processed"
cases_path = os.path.join(processed_folder, "drug_crime_cases.csv")

if not os.path.exists(cases_path):
    print("Error: drug_crime_cases.csv not found. Please run case representation script first.")
    # exit() is an interactive-shell helper injected by the `site` module and
    # is not meant for programs; raise SystemExit directly, with a non-zero
    # status so the failure is visible to whatever launched this code.
    raise SystemExit(1)

df = pd.read_csv(cases_path)
print(f"Loaded {len(df)} drug crime cases from processed data")

# DEBUG: show the schema and a sample so column-dependent logic below can be checked.
print("\nAvailable columns in the dataset:")
print(df.columns.tolist())
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()

# Choose the column that supplies classification labels. Columns already
# present in the dataset win, in priority order; a keyword-derived label is
# generated only when none of them exists.

# (label, keywords) pairs checked in order against the 'jenis_narkoba' field.
_DRUG_FIELD_KEYWORDS = (
    ('sabu', ('sabu', 'metamfetamin')),
    ('ganja', ('ganja', 'marijuana')),
    ('heroin', ('heroin',)),
    ('ekstasi', ('ekstasi', 'mdma')),
    ('kokain', ('kokain',)),
)

# (label, keywords) pairs checked in order against the free-text columns.
_DRUG_TEXT_KEYWORDS = (
    ('sabu', ('sabu', 'metamfetamin', 'shabu')),
    ('ganja', ('ganja', 'marijuana', 'cannabis')),
    ('heroin', ('heroin', 'putaw')),
    ('ekstasi', ('ekstasi', 'mdma', 'ecstasy')),
    ('kokain', ('kokain', 'cocaine')),
    ('psikotropika', ('psikotropika',)),
)

# "pil" must be matched as a whole word: as a bare substring it also hits
# unrelated words such as "pilihan" or "dipilih".
_PIL_WORD = re.compile(r'\bpil\b')


def classify_drug_case(row):
    """Derive a coarse drug-type label for one case row.

    Args:
        row: A DataFrame row (pandas Series). The columns 'jenis_narkoba',
            'ringkasan_fakta', 'argumen_hukum' and 'text_full' are read when
            present; missing or NaN values are ignored.

    Returns:
        str: One of 'sabu', 'ganja', 'heroin', 'ekstasi', 'kokain',
        'psikotropika' or 'narkotika_lainnya'.
    """
    # An explicit drug-type field, when filled in, takes precedence over text.
    raw_type = row.get('jenis_narkoba')
    if pd.notna(raw_type) and str(raw_type).strip():
        drug_type = str(raw_type).lower()
        for label, keywords in _DRUG_FIELD_KEYWORDS:
            if any(keyword in drug_type for keyword in keywords):
                return label

    # Otherwise scan the lower-cased free text of the case.
    text = " ".join(
        str(row[col]).lower()
        for col in ('ringkasan_fakta', 'argumen_hukum', 'text_full')
        if pd.notna(row.get(col))
    )
    for label, keywords in _DRUG_TEXT_KEYWORDS:
        if any(keyword in text for keyword in keywords):
            return label
    if _PIL_WORD.search(text):
        return 'psikotropika'
    return 'narkotika_lainnya'


class_column = None
for _candidate, _description in (
    ('jenis_narkoba', 'drug type'),
    ('jenis_tindakan', 'action type'),
    ('status_putusan', 'verdict status'),
):
    if _candidate in df.columns:
        class_column = _candidate
        print(f"Using {_description} classification: {class_column}")
        break
else:
    # None of the annotated columns exists: build a keyword-based label.
    print("Creating enhanced drug crime classification...")
    df['klasifikasi_narkoba'] = df.apply(classify_drug_case, axis=1)
    class_column = 'klasifikasi_narkoba'
    print("Created enhanced drug crime classification")

# =====================================================
# i. REPRESENTASI VEKTOR (CORRECTED FOR DRUG CRIMES)
# =====================================================

class DrugCaseVectorizer:
    """Builds TF-IDF and, when available, IndoBERT representations of case texts.

    Attributes are populated lazily: ``setup_tfidf`` fits ``tfidf_vectorizer``
    and ``setup_bert`` loads ``bert_tokenizer`` / ``bert_model`` and sets
    ``is_bert_loaded``.
    """

    # Indonesian function words removed before TF-IDF fitting.
    _STOPWORDS = frozenset([
        'dan', 'atau', 'yang', 'untuk', 'dari', 'dengan', 'pada',
        'dalam', 'ke', 'di', 'adalah', 'oleh', 'akan', 'telah', 'sudah',
        'ini', 'itu', 'tidak', 'ada', 'juga', 'dapat', 'bisa', 'harus',
        'maka', 'saja', 'hanya', 'masih', 'sebuah', 'satu', 'dua',
        'bahwa', 'tersebut', 'sebagai', 'atas', 'karena', 'sehingga',
    ])

    # Legal / drug abbreviations kept even when they fail the length filter.
    _SHORT_TERMS = frozenset(['uud', 'kuhp', 'uu', 'pp', 'thc', 'mdma'])

    def __init__(self):
        self.tfidf_vectorizer = None
        self.bert_tokenizer = None
        self.bert_model = None
        self.is_bert_loaded = False

    def setup_tfidf(self, texts, max_features=5000):
        """Fit the TF-IDF vectorizer on ``texts`` and return the document matrix.

        Args:
            texts: Iterable of raw case texts (NaN values are treated as empty).
            max_features: Vocabulary size cap passed to ``TfidfVectorizer``.

        Returns:
            The matrix produced by ``fit_transform`` on the cleaned texts.
        """
        print("Setting up TF-IDF vectorizer for drug crime cases...")
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=max_features,
            stop_words=None,  # Indonesian stopwords are removed in clean_text_for_drug_cases
            lowercase=True,
            ngram_range=(1, 3)  # Include trigrams for drug names
        )

        cleaned_texts = [self.clean_text_for_drug_cases(text) for text in texts]
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(cleaned_texts)
        print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
        return tfidf_matrix

    def setup_bert(self):
        """Load the IndoBERT tokenizer and model.

        Returns:
            bool: True when the model was loaded, False when the transformers
            stack is unavailable or loading failed (TF-IDF remains usable).
        """
        if not BERT_AVAILABLE:
            print("BERT not available, skipping BERT setup")
            return False

        try:
            print("Setting up IndoBERT model...")
            model_name = 'indobenchmark/indobert-base-p1'
            self.bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.bert_model = AutoModel.from_pretrained(model_name)
            self.bert_model.eval()
            self.is_bert_loaded = True
            print("IndoBERT model loaded successfully")
            return True
        except Exception as e:
            # Deliberate best-effort: any load failure falls back to TF-IDF.
            print(f"Error loading BERT model: {e}")
            print("Falling back to TF-IDF only")
            return False

    def clean_text_for_drug_cases(self, text):
        """Normalise text for TF-IDF: lower-case, strip punctuation and stopwords.

        Words of two characters or fewer are dropped unless they are one of
        the whitelisted abbreviations in ``_SHORT_TERMS`` (e.g. "uu", "pp").

        Args:
            text: Raw text; None/NaN yields an empty string.

        Returns:
            str: Space-joined surviving tokens.
        """
        if pd.isna(text):
            return ""

        text = str(text).lower()
        # Replace punctuation with spaces, then collapse whitespace runs.
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text)

        # Single pass: the whitelist has to be consulted in the same test as
        # the length filter, otherwise short abbreviations are discarded
        # before they can be rescued.
        kept = [
            word for word in text.split()
            if word not in self._STOPWORDS
            and (len(word) > 2 or word in self._SHORT_TERMS)
        ]
        return ' '.join(kept)

    def get_bert_embeddings(self, texts, batch_size=8):
        """Return one [CLS] embedding per text, or None if BERT is not loaded.

        Args:
            texts: Sequence of raw texts (NaN values are treated as empty).
            batch_size: Number of texts encoded per forward pass.

        Returns:
            numpy.ndarray with one row per input text, or None when the model
            has not been loaded via ``setup_bert``.
        """
        if not self.is_bert_loaded:
            return None

        print(f"Generating BERT embeddings for {len(texts)} drug crime cases...")
        embeddings = []

        for start in range(0, len(texts), batch_size):
            batch_texts = [
                "" if pd.isna(text) else str(text)
                for text in texts[start:start + batch_size]
            ]

            # The tokenizer truncates to 512 *tokens*; no character-level
            # slicing is applied beforehand so the model sees as much of each
            # document as it can actually consume.
            inputs = self.bert_tokenizer(
                batch_texts,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=512
            )

            with torch.no_grad():
                outputs = self.bert_model(**inputs)

            # Position 0 of the last hidden state is the [CLS] token.
            cls_vectors = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.extend(cls_vectors)
            print(f"Processed {min(start + batch_size, len(texts))}/{len(texts)} drug crime cases")

        return np.array(embeddings)

# Initialize vectorizer
vectorizer = DrugCaseVectorizer()

# Pick the first text column that exists, in the priority order used by the
# case-representation step.
text_column = next(
    (col for col in ('text_full', 'ringkasan_fakta', 'argumen_hukum') if col in df.columns),
    None,
)

if text_column is None:
    print("Error: No text column found for vectorization")
    # exit() is an interactive-shell helper from the `site` module; raise
    # SystemExit directly and report failure with a non-zero status.
    raise SystemExit(1)

print(f"Using text column: {text_column}")
texts = df[text_column].fillna('').astype(str).tolist()

# Fit TF-IDF on every case text; the resulting matrix is the retrieval index.
tfidf_matrix = vectorizer.setup_tfidf(texts)

# BERT embeddings are optional and stay None when the model cannot be loaded.
bert_embeddings = None
if vectorizer.setup_bert():
    bert_embeddings = vectorizer.get_bert_embeddings(texts)

# =====================================================
# ii. SPLITTING DATA (CORRECTED)
# =====================================================

print("\nSplitting drug crime data for model training...")

# Class labels as strings; missing values become the literal 'unknown'.
labels = df[class_column].fillna('unknown').astype(str).tolist()

# Map the string labels to integer codes.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

label_counts = Counter(labels)
print(f"Found {len(label_counts)} unique drug crime classifications")
print("Drug crime label distribution:")
for label, count in label_counts.most_common(10):
    print(f"  {label}: {count}")

# Positional 70/30 split: the first rows train, the remainder tests.
# NOTE(review): rows are not shuffled or stratified; confirm the CSV order
# is not correlated with the label before trusting test metrics.
train_ratio = 0.7
split_idx = int(len(df) * train_ratio)

all_indices = list(range(len(df)))
train_indices = all_indices[:split_idx]
test_indices = all_indices[split_idx:]

print(f"Training data: {len(train_indices)} drug crime cases")
print(f"Test data: {len(test_indices)} drug crime cases")

# TF-IDF features and encoded labels for each partition.
X_train_tfidf, X_test_tfidf = tfidf_matrix[train_indices], tfidf_matrix[test_indices]
y_train, y_test = encoded_labels[train_indices], encoded_labels[test_indices]

# BERT features are partitioned the same way when they exist.
if bert_embeddings is not None:
    X_train_bert, X_test_bert = bert_embeddings[train_indices], bert_embeddings[test_indices]

# =====================================================
# iii. MODEL RETRIEVAL (ENHANCED FOR DRUG CRIMES)
# =====================================================

class DrugCaseRetrievalModel:
    """Container for the classifiers and lookup objects used in retrieval.

    All attributes start as None and are filled in by the training methods
    or assigned directly by the caller.
    """

    def __init__(self):
        self.naive_bayes_model = None
        self.tfidf_vectorizer = None
        self.bert_model = None
        self.label_encoder = None
        self.case_data = None

    def train_naive_bayes(self, X_train, y_train):
        """Fit a MultinomialNB (alpha=0.1) on the given features and labels."""
        print("Training Naive Bayes model for drug crime cases...")
        self.naive_bayes_model = classifier = MultinomialNB(alpha=0.1)
        classifier.fit(X_train, y_train)
        print("Naive Bayes model trained successfully")

    def train_bert_classifier(self, X_train, y_train):
        """Fit a logistic-regression classifier on BERT embeddings.

        Does nothing when ``X_train`` is None (no embeddings available).
        """
        if X_train is not None:
            print("Training classifier on BERT embeddings for drug crimes...")
            from sklearn.linear_model import LogisticRegression
            self.bert_model = classifier = LogisticRegression(max_iter=1000, random_state=42)
            classifier.fit(X_train, y_train)
            print("BERT classifier trained successfully")

    def save_models(self, save_path):
        """Persist whichever components are set into ``save_path`` via joblib.

        The directory is created if needed; unset components are skipped.
        """
        os.makedirs(save_path, exist_ok=True)

        # (component, file name) in the order they are written.
        artifacts = (
            (self.naive_bayes_model, 'drug_naive_bayes_model.pkl'),
            (self.bert_model, 'drug_bert_classifier.pkl'),
            (self.label_encoder, 'drug_label_encoder.pkl'),
        )
        for component, file_name in artifacts:
            if component:
                joblib.dump(component, os.path.join(save_path, file_name))

        print(f"Drug crime models saved to {save_path}")

# Build the model container and hand it the objects prepared above.
retrieval_model = DrugCaseRetrievalModel()
retrieval_model.case_data = df
retrieval_model.tfidf_vectorizer = vectorizer.tfidf_vectorizer
retrieval_model.label_encoder = label_encoder

# Naive Bayes is always trained, on the TF-IDF training partition.
retrieval_model.train_naive_bayes(X_train_tfidf, y_train)

# The embedding-based classifier is trained only when embeddings were produced.
if bert_embeddings is not None:
    retrieval_model.train_bert_classifier(X_train_bert, y_train)

# =====================================================
# iv. FUNGSI RETRIEVAL (ENHANCED FOR DRUG CRIMES)
# =====================================================

def retrieve(query: str, k: int = 5) -> list:
    """
    Retrieve the drug crime cases most similar to a query using TF-IDF.

    Args:
        query (str): Search query
        k (int): Number of top cases to return (default: 5). Values <= 0
            yield an empty list; values above the corpus size return every case.

    Returns:
        list: One dict per case, ordered by descending cosine similarity.
        Each dict has 'case_id', 'similarity' (float) and 'index' (int row
        position), plus 'no_perkara', 'jenis_narkoba', 'jenis_tindakan',
        'status_putusan' and a 100-character 'ringkasan_fakta' excerpt for
        whichever of those columns exist in the dataset.
    """
    # Guard first: with k == 0 the slice [-k:] would select *every* row.
    if k <= 0:
        return []

    # 1) Pre-process query with the same cleaner used to build the index
    processed_query = vectorizer.clean_text_for_drug_cases(query)
    print(f"Processed query: {processed_query}")

    # 2) Convert query to vector
    query_tfidf = vectorizer.tfidf_vectorizer.transform([processed_query])

    # 3) Calculate cosine similarity with all case vectors
    similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

    # 4) Indices of the k highest scores, best first
    top_k_indices = similarities.argsort()[::-1][:k]

    optional_columns = ('no_perkara', 'jenis_narkoba', 'jenis_tindakan', 'status_putusan')

    case_ids = []
    for idx in top_k_indices:
        # Plain int keeps the result JSON-serialisable (argsort yields NumPy ints).
        idx = int(idx)
        row = df.iloc[idx]

        case_info = {
            # Fall back to a synthetic ID when the dataset has no case_id column.
            'case_id': row.get('case_id', f"drug_case_{idx+1:03d}"),
            'similarity': float(similarities[idx]),
            'index': idx
        }

        # Add drug crime specific columns that are present in the dataset
        for column in optional_columns:
            if column in df.columns:
                case_info[column] = row[column]
        if 'ringkasan_fakta' in df.columns:
            case_info['ringkasan_fakta'] = str(row['ringkasan_fakta'])[:100]

        case_ids.append(case_info)

    return case_ids

def retrieve_with_bert(query: str, k: int = 5) -> list:
    """
    Retrieve the drug crime cases most similar to a query using BERT embeddings.

    Falls back to the TF-IDF ``retrieve`` when the BERT model or the case
    embeddings are unavailable.

    Args:
        query (str): Search query
        k (int): Number of top cases to return (default: 5). Values <= 0
            yield an empty list.

    Returns:
        list: One dict per case, ordered by descending cosine similarity,
        with the same keys that ``retrieve`` produces.
    """
    if not vectorizer.is_bert_loaded or bert_embeddings is None:
        print("BERT not available, using TF-IDF retrieval")
        return retrieve(query, k)

    # Guard first: with k == 0 the slice [-k:] would select *every* row.
    if k <= 0:
        return []

    # Get BERT embedding for query
    query_embedding = vectorizer.get_bert_embeddings([query])
    if query_embedding is None:
        return retrieve(query, k)

    # Calculate cosine similarity with BERT embeddings
    similarities = cosine_similarity(query_embedding, bert_embeddings).flatten()

    # Indices of the k highest scores, best first
    top_k_indices = similarities.argsort()[::-1][:k]

    # Same optional fields as the TF-IDF path, so callers see one result
    # shape regardless of which backend answered.
    optional_columns = ('no_perkara', 'jenis_narkoba', 'jenis_tindakan', 'status_putusan')

    case_ids = []
    for idx in top_k_indices:
        # Plain int keeps the result JSON-serialisable (argsort yields NumPy ints).
        idx = int(idx)
        row = df.iloc[idx]

        case_info = {
            # Fall back to a synthetic ID when the dataset has no case_id column.
            'case_id': row.get('case_id', f"drug_case_{idx+1:03d}"),
            'similarity': float(similarities[idx]),
            'index': idx
        }

        for column in optional_columns:
            if column in df.columns:
                case_info[column] = row[column]
        if 'ringkasan_fakta' in df.columns:
            case_info['ringkasan_fakta'] = str(row['ringkasan_fakta'])[:100]

        case_ids.append(case_info)

    return case_ids

def retrieve_by_drug_type(drug_type: str, k: int = 5) -> list:
    """
    Retrieve cases whose drug-type column contains ``drug_type``.

    The match is a case-insensitive literal substring test against
    'jenis_narkoba', or 'klasifikasi_narkoba' when the former is absent.
    Matching cases are ranked by 'word_count' (largest first) when that
    column exists, otherwise returned in dataset order.

    Args:
        drug_type (str): Text to look for in the drug-type column.
        k (int): Maximum number of cases to return; values <= 0 yield [].

    Returns:
        list: Dicts with 'case_id', 'jenis_narkoba', 'jenis_tindakan',
        'status_putusan' and 'no_perkara' (empty string when a column is
        missing). Empty when no column or no case matches.
    """
    if k <= 0:
        return []

    if 'jenis_narkoba' in df.columns:
        type_column = 'jenis_narkoba'
    elif 'klasifikasi_narkoba' in df.columns:
        type_column = 'klasifikasi_narkoba'
    else:
        return []

    # Coerce to str so the .str accessor works even on an all-NaN/numeric
    # column, and match literally (regex=False) so characters such as "("
    # or "+" in the search text are not interpreted as a pattern.
    column_text = df[type_column].fillna('').astype(str)
    filtered_df = df[column_text.str.contains(drug_type, case=False, regex=False)]

    if filtered_df.empty:
        return []

    # Word count serves as a proxy for how detailed a case is.
    if 'word_count' in filtered_df.columns:
        top_cases = filtered_df.nlargest(k, 'word_count')
    else:
        top_cases = filtered_df.head(k)

    return [
        {
            'case_id': row.get('case_id', f"drug_case_{idx+1:03d}"),
            'jenis_narkoba': row.get('jenis_narkoba', ''),
            'jenis_tindakan': row.get('jenis_tindakan', ''),
            'status_putusan': row.get('status_putusan', ''),
            'no_perkara': row.get('no_perkara', '')
        }
        for idx, row in top_cases.iterrows()
    ]

# =====================================================
# v. PENGUJIAN AWAL (DRUG CRIME SPECIFIC)
# =====================================================

print("\n=== INITIAL TESTING FOR DRUG CRIME RETRIEVAL ===")

# Sample queries covering drug types, offences, sentences and statutes.
test_queries = [
    "Tindak pidana penyalahgunaan sabu-sabu",
    "kasus perdagangan ganja",
    "pengedar narkotika heroin",
    "kepemilikan ekstasi untuk dijual",
    "rehabilitasi pengguna narkoba",
    "penjara seumur hidup narkotika",
    "denda uang karena sabu",
    "barang bukti narkoba",
    "UU 35 tahun 2009 narkotika",
    "pidana penjara 5 tahun narkoba"
]

print("Testing drug crime retrieval system with sample queries...")

for query_number, query in enumerate(test_queries, start=1):
    print(f"\n--- Drug Crime Test Query {query_number}: '{query}' ---")

    # Top three TF-IDF matches for this query.
    results_tfidf = retrieve(query, k=3)
    print("TF-IDF Results:")
    for rank, result in enumerate(results_tfidf, start=1):
        print(f"  {rank}. {result['case_id']} (sim: {result['similarity']:.3f})")
        # Optional fields are shown only when the result carries them.
        for field, caption in (
            ('jenis_narkoba', 'Jenis Narkoba'),
            ('jenis_tindakan', 'Jenis Tindakan'),
        ):
            if field in result:
                print(f"     {caption}: {result[field]}")

# Exercise the drug-type filter for a few common substances.
print("\n--- Testing Drug Type Specific Retrieval ---")
drug_types = ['sabu', 'ganja', 'heroin', 'ekstasi']
for drug_type in drug_types:
    results = retrieve_by_drug_type(drug_type, k=2)
    if not results:
        continue
    print(f"\n{drug_type.upper()} cases:")
    for result in results:
        print(f"  - {result['case_id']}: {result['jenis_narkoba']}")

# Summary of the run; built in memory only (this section does not write it out).
system_info = {
    'total_drug_cases': len(df),
    'tfidf_features': tfidf_matrix.shape[1],
    'bert_available': bert_embeddings is not None,
    'bert_dim': None if bert_embeddings is None else bert_embeddings.shape[1],
    'text_column_used': text_column,
    'classification_column': class_column,
    'drug_types_available': (
        df[class_column].value_counts().to_dict() if class_column in df.columns else {}
    ),
}
test_results = {
    'test_queries': test_queries,
    'system_info': system_info,
}

# =====================================================
# vi. SAVE MODELS AND OUTPUTS (CORRECTED)
# =====================================================

# Directory on Drive that receives every persisted artifact.
models_dir = "/content/drive/MyDrive/models"
os.makedirs(models_dir, exist_ok=True)

# Classifiers and label encoder.
retrieval_model.save_models(models_dir)

# Fitted TF-IDF vectorizer, needed to transform queries later.
tfidf_vectorizer_path = os.path.join(models_dir, 'drug_tfidf_vectorizer.pkl')
joblib.dump(vectorizer.tfidf_vectorizer, tfidf_vectorizer_path)

# Case embeddings (only when BERT ran) and the dense TF-IDF document matrix.
if bert_embeddings is not None:
    bert_embeddings_path = os.path.join(models_dir, 'drug_bert_embeddings.npy')
    np.save(bert_embeddings_path, bert_embeddings)

tfidf_matrix_path = os.path.join(models_dir, 'drug_tfidf_matrix.npy')
np.save(tfidf_matrix_path, tfidf_matrix.toarray())

# Dataset including any classification column added above.
classified_csv_path = os.path.join(processed_folder, "drug_crime_cases_with_classification.csv")
df.to_csv(classified_csv_path, index=False)

# Source of the standalone retrieval script written next to the saved models.
# A raw (non-f) string is used on purpose: the template contains regex
# backslashes (\w, \s) that are invalid escape sequences in a normal string
# literal, and it needs no interpolation, so braces can be written literally.
retrieval_script = r'''
def retrieve(query: str, k: int = 5) -> list:
    """
    Retrieve top-k most similar drug crime cases for a given query

    Args:
        query (str): Search query
        k (int): Number of cases to retrieve

    Returns:
        list: List of case_id strings
    """
    # Load models and vectorizers
    import joblib
    import numpy as np
    import pandas as pd
    from sklearn.metrics.pairwise import cosine_similarity
    import re

    # Load saved components for drug crime cases
    tfidf_vectorizer = joblib.load('/content/drive/MyDrive/models/drug_tfidf_vectorizer.pkl')
    tfidf_matrix = np.load('/content/drive/MyDrive/models/drug_tfidf_matrix.npy')
    df = pd.read_csv('/content/drive/MyDrive/data/processed/drug_crime_cases_with_classification.csv')

    def clean_text_for_drug_cases(text):
        """Clean text for TF-IDF processing - enhanced for drug cases"""
        if pd.isna(text):
            return ""

        text = str(text).lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text)

        stopwords = ['dan', 'atau', 'yang', 'untuk', 'dari', 'dengan', 'pada',
                    'dalam', 'ke', 'di', 'adalah', 'oleh', 'akan', 'telah', 'sudah',
                    'ini', 'itu', 'tidak', 'ada', 'juga', 'dapat', 'bisa', 'harus',
                    'maka', 'saja', 'hanya', 'masih', 'sebuah', 'satu', 'dua',
                    'bahwa', 'tersebut', 'sebagai', 'atas', 'karena', 'sehingga']

        drug_terms = ['uud', 'kuhp', 'uu', 'pp', 'thc', 'mdma']
        words = text.split()
        words = [word for word in words if (len(word) > 2 or word in drug_terms) and word not in stopwords]
        return ' '.join(words)

    # Process query
    processed_query = clean_text_for_drug_cases(query)
    query_vector = tfidf_vectorizer.transform([processed_query])

    # Calculate similarities
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Get top-k indices
    top_k_indices = similarities.argsort()[-k:][::-1]

    # Return case IDs
    case_ids = []
    for idx in top_k_indices:
        case_id = df.iloc[idx].get('case_id', f"drug_case_{idx+1:03d}")
        case_ids.append(case_id)

    return case_ids
'''

# Write with an explicit encoding so the file does not depend on the
# platform's default locale.
with open(os.path.join(models_dir, '03_drug_retrieval.py'), 'w', encoding='utf-8') as f:
    f.write(retrieval_script)

# Export the sample queries, each paired with its current top-10 results, as JSON.
# NOTE(review): the 'ground_truth' lists are produced by retrieve() itself, so
# they record the system's own output rather than independent relevance labels.
eval_queries_path = '/content/drive/MyDrive/data/eval/drug_queries.json'
os.makedirs('/content/drive/MyDrive/data/eval', exist_ok=True)

eval_queries = [
    {
        'query': query,
        'ground_truth': [hit['case_id'] for hit in retrieve(query, k=10)],
        'query_type': 'drug_crime'
    }
    for query in test_queries
]

with open(eval_queries_path, 'w', encoding='utf-8') as f:
    json.dump(eval_queries, f, ensure_ascii=False, indent=2)

# Static report: what was produced and where it was written.
print("\n".join([
    "\n=== FINAL OUTPUT FOR DRUG CRIME RETRIEVAL ===",
    "✓ Script 03_drug_retrieval.py created",
    "✓ retrieve() function implemented for drug crime cases",
    "✓ Drug crime models saved to /content/drive/MyDrive/models/",
    "✓ Drug crime evaluation queries saved",
    "✓ Updated CSV with drug classification saved",
    "\n📁 File Structure:",
    "├── /content/drive/MyDrive/models/",
    "│   ├── 03_drug_retrieval.py",
    "│   ├── drug_naive_bayes_model.pkl",
    "│   ├── drug_tfidf_vectorizer.pkl",
    "│   ├── drug_tfidf_matrix.npy",
    "│   └── drug_bert_embeddings.npy (if BERT available)",
    "├── /content/drive/MyDrive/data/processed/",
    "│   └── drug_crime_cases_with_classification.csv",
    "└── /content/drive/MyDrive/data/eval/",
    "    └── drug_queries.json",
]))

# Run-specific figures for the index that was just built.
print("\n🎯 Drug Crime Retrieval System Ready!")
print(f"   - {len(df)} drug crime cases indexed")
print(f"   - TF-IDF: {tfidf_matrix.shape[1]} features")
if bert_embeddings is not None:
    print("   - BERT: Available")
else:
    print("   - BERT: Not Available")
print(f"   - Text column used: {text_column}")
print(f"   - Classification column: {class_column}")
if class_column in df.columns:
    print(f"   - Drug types: {list(df[class_column].value_counts().head().index)}")
else:
    print("   - Drug types: N/A")
print("   - Ready for drug crime query testing!")

Transformers library available - IndoBERT will be used
Loaded 64 drug crime cases from processed data

Available columns in the dataset:
['case_id', 'no_perkara', 'tanggal', 'ringkasan_fakta', 'pasal', 'pihak', 'jenis_narkoba', 'berat_narkoba', 'jenis_tindakan', 'status_putusan', 'jenis_hukuman', 'durasi_hukuman', 'jumlah_denda', 'text_full', 'argumen_hukum', 'length', 'word_count', 'qa_pairs', 'drug_mentions']

First few rows:
         case_id                     no_perkara           tanggal  \
0  drug_case_001    34/Pid.Sus.Anak/2016/PN.Kla   26 Agustus 2016   
1  drug_case_002          0288/Pdt.G/2016/PA.JU  15 Februari 2016   
2  drug_case_003            160/PDT/2015/PT SBY     30 Maret 2015   
3  drug_case_004  121 / Pid.Sus / 2015 / PN-Mbo                 —   
4  drug_case_005        805/Pid.Sus/2016/PN TBT                 —   

                                     ringkasan_fakta pasal  \
0  barang bukti:- 1 (satu) potong baju kaos warna...   NaN   
1                            

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

IndoBERT model loaded successfully
Generating BERT embeddings for 64 drug crime cases...
Processed 8/64 drug crime cases
Processed 16/64 drug crime cases
Processed 24/64 drug crime cases
Processed 32/64 drug crime cases
Processed 40/64 drug crime cases
Processed 48/64 drug crime cases
Processed 56/64 drug crime cases
Processed 64/64 drug crime cases

Splitting drug crime data for model training...
Found 4 unique drug crime classifications
Drug crime label distribution:
  tidak diketahui: 61
  narkotika: 1
  narkotika, ganja: 1
  ganja: 1
Training data: 44 drug crime cases
Test data: 20 drug crime cases
Training Naive Bayes model for drug crime cases...
Naive Bayes model trained successfully
Training classifier on BERT embeddings for drug crimes...
BERT classifier trained successfully

=== INITIAL TESTING FOR DRUG CRIME RETRIEVAL ===
Testing drug crime retrieval system with sample queries...

--- Drug Crime Test Query 1: 'Tindak pidana penyalahgunaan sabu-sabu' ---
Processed query: tind