In [None]:
from IPython import get_ipython
from IPython.display import display

!pip install pandas requests beautifulsoup4 pdfminer.six lxml > /dev/null 2>&1

# Mount Google Drive at /content/drive; the cells below read their input CSV
# from, and write their outputs to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# ----------CASE REPRESENTATION --------------

import pandas as pd
import re
import json
import os
from datetime import date, datetime
from collections import Counter
import numpy as np

# Load preprocessed drug crime data.
# Input files are named preprocessed_drug_crime_cases_<YYYY-MM-DD>.csv; prefer
# the one stamped with today's date and fall back to the newest one otherwise.
today = date.today().strftime("%Y-%m-%d")
csv_dir = "/content/drive/MyDrive/CSV"
csv_prefix = "preprocessed_drug_crime_cases_"
input_path = f"/content/drive/MyDrive/CSV/preprocessed_drug_crime_cases_{today}.csv"

# If today's file doesn't exist, find the most recent one
if not os.path.exists(input_path):
    # A missing directory is treated the same as "no candidate files" so the
    # user gets one clear error instead of a raw os.listdir failure.
    dir_entries = os.listdir(csv_dir) if os.path.isdir(csv_dir) else []
    csv_files = [
        f for f in dir_entries
        if f.startswith(csv_prefix) and f.endswith('.csv')
    ]
    if not csv_files:
        # Raise instead of calling exit(): exit() inside a notebook kernel does
        # not simply stop this cell, and it hides the reason from the caller.
        raise FileNotFoundError(
            f"No preprocessed drug crime file found! (looked in {csv_dir})"
        )
    # The ISO date embedded in the filename sorts lexicographically, so the
    # greatest name is the newest export. os.path.getctime is not used because
    # on Unix it reports the last metadata change, not the creation time.
    latest_file = max(csv_files)
    input_path = os.path.join(csv_dir, latest_file)
    print(f"Using latest preprocessed file: {latest_file}")

df = pd.read_csv(input_path)
print(f"Loaded data shape: {df.shape}")

# =====================================================
# i. EKSTRAKSI METADATA (DRUG CRIME SPECIFIC)
# =====================================================

# Article numbers such as "112" or "114a" following the word "pasal".
# The trailing \b stops the optional letter suffix from swallowing the first
# letter of the next word ("pasal 112 ayat" must yield "112", not "112 a").
_PASAL_RE = re.compile(r'pasal\s*(\d+(?:\s*[a-z]\b)?)')
# Mentions of Law 35/2009 (the narcotics law), e.g. "uu no. 35/2009".
_UU_NARKOTIKA_RE = re.compile(r'uu\s*(?:no\.?)?\s*35\s*/\s*2009')


def _field_as_str(row, field):
    """Return row[field] as a string, or '' when it is missing or NaN/None."""
    value = row.get(field, '')
    return '' if pd.isna(value) else str(value)


def extract_metadata(row):
    """Extract key metadata from each drug crime case.

    Args:
        row: One record of the preprocessed case table (a pandas Series, as
            yielded by ``DataFrame.iterrows``). ``row.name`` is used to build
            the case id and must support ``+ 1`` (assumes an integer index).

    Returns:
        dict with these string fields:
            case_id          "drug_case_NNN", numbered from ``row.name + 1``.
            no_perkara       copied from the 'nomor' column.
            tanggal          copied from 'tanggal_register'.
            ringkasan_fakta  always '' here; filled in by a later step.
            pasal            up to five distinct article references found in
                             the text fields, in order of first appearance.
            pihak            "Hakim Ketua: ...; Hakim Anggota: ...; Panitera: ..."
                             built from whichever of those columns are present.
            jenis_narkoba, berat_narkoba, jenis_tindakan, status_putusan,
            jenis_hukuman, durasi_hukuman, jumlah_denda
                             copied from the same-named columns.
            text_full        the available text fields joined with spaces.

        Missing or NaN source values become '' rather than the string "nan".
    """
    metadata = {
        'case_id': f"drug_case_{row.name + 1:03d}",
        'no_perkara': _field_as_str(row, 'nomor'),
        'tanggal': _field_as_str(row, 'tanggal_register'),
        'ringkasan_fakta': '',
        'pasal': '',
        'pihak': '',
        'jenis_narkoba': _field_as_str(row, 'jenis_narkoba'),
        'berat_narkoba': _field_as_str(row, 'berat_narkoba'),
        'jenis_tindakan': _field_as_str(row, 'jenis_tindakan'),
        'status_putusan': _field_as_str(row, 'status_putusan'),
        'jenis_hukuman': _field_as_str(row, 'jenis_hukuman'),
        'durasi_hukuman': _field_as_str(row, 'durasi_hukuman'),
        'jumlah_denda': _field_as_str(row, 'jumlah_denda'),
        'text_full': ''
    }

    # Extract pasal (legal articles) from the free-text fields.
    pasal_found = []
    for field in ('amar', 'catatan_amar', 'abstrak', 'text_pdf', 'kaidah'):
        text = _field_as_str(row, field).lower()
        if not text:
            continue
        pasal_found.extend(_PASAL_RE.findall(text))
        if _UU_NARKOTIKA_RE.search(text):
            pasal_found.append('UU 35/2009')

    # De-duplicate while keeping first-seen order, so the five references that
    # survive the cut are the same on every run (a set has no stable order).
    unique_pasal = list(dict.fromkeys(pasal_found))
    metadata['pasal'] = ', '.join(unique_pasal[:5])

    # Extract pihak (parties involved).
    party_columns = (
        ('Hakim Ketua', 'hakim_ketua'),
        ('Hakim Anggota', 'hakim_anggota'),
        ('Panitera', 'panitera'),
    )
    metadata['pihak'] = '; '.join(
        f"{label}: {row[column]}"
        for label, column in party_columns
        if pd.notna(row.get(column))
    )

    # Combine full text from whichever text fields this row actually has.
    metadata['text_full'] = ' '.join(
        str(row[field])
        for field in ('judul', 'abstrak', 'amar', 'catatan_amar', 'kaidah', 'text_pdf')
        if pd.notna(row.get(field))
    )

    return metadata

# Run extract_metadata over every row of the loaded table and collect the
# resulting dicts into a new DataFrame (one row per case).
print("Extracting metadata...")
metadata_list = [extract_metadata(case_row) for _, case_row in df.iterrows()]

metadata_df = pd.DataFrame(metadata_list)
print(f"Metadata extracted for {len(metadata_df)} cases")

# =====================================================
# ii. EKSTRAKSI KONTEN KUNCI (DRUG CRIME SPECIFIC)
# =====================================================

def extract_key_content(text):
    """Pull keyword-anchored snippets out of a case text.

    The text is lower-cased, then for each keyword the first two spans that
    start at the keyword and run up to (and including) the next '.' are
    collected. Spans from different keywords may overlap or repeat.

    Args:
        text: Case text; NaN/None yields empty results.

    Returns:
        dict with two strings, each capped at 300 characters:
            ringkasan_fakta  snippets anchored on fact-related keywords.
            argumen_hukum    snippets anchored on legal-reasoning keywords.
    """
    if pd.isna(text):
        return {'ringkasan_fakta': '', 'argumen_hukum': ''}

    lowered = str(text).lower()

    def collect_snippets(keywords):
        # At most two matches per keyword, concatenated in keyword order.
        snippets = []
        for keyword in keywords:
            snippets += re.findall(rf'{keyword}[^.]*[.]', lowered)[:2]
        return ' '.join(snippets)[:300]

    # Keywords hinting at the facts of the case (evidence, drugs, arrest).
    fact_keywords = (
        'barang bukti', 'dakwaan', 'fakta', 'peristiwa', 'kejadian',
        'narkotika', 'sabu', 'ganja', 'heroin', 'ekstasi', 'kokain',
        'menangkap', 'menemukan', 'menggeledah', 'menyita',
    )
    # Keywords hinting at the legal reasoning and the ruling.
    legal_keywords = (
        'menyatakan', 'memutuskan', 'mengadili', 'pasal', 'undang-undang',
        'uu 35/2009', 'narkotika', 'bersalah', 'tidak bersalah', 'pidana penjara',
        'pidana denda', 'rehabilitasi',
    )

    return {
        'ringkasan_fakta': collect_snippets(fact_keywords),
        'argumen_hukum': collect_snippets(legal_keywords),
    }

# Derive the fact summary and legal-argument snippets for every case and
# store them on the matching metadata row.
print("Extracting key content...")
text_fields = ['amar', 'catatan_amar', 'abstrak', 'text_pdf', 'kaidah']
for idx in metadata_df.index:
    # The metadata index label is used as a position into the source table.
    original_row = df.iloc[idx]

    # Concatenate the available text fields, each preceded by a single space.
    combined_text = ''.join(
        ' ' + str(original_row[field])
        for field in text_fields
        if field in df.columns and pd.notna(original_row.get(field))
    )

    key_content = extract_key_content(combined_text)
    for column in ('ringkasan_fakta', 'argumen_hukum'):
        metadata_df.at[idx, column] = key_content[column]

# =====================================================
# iii. FEATURE ENGINEERING (DRUG CRIME ENHANCED)
# =====================================================

def calculate_text_features(text):
    """Compute simple size and keyword-presence features for a case text.

    Args:
        text: Case text; NaN/None yields all-zero features.

    Returns:
        dict with integer values:
            length         number of characters.
            word_count     number of whitespace-separated tokens.
            qa_pairs       how many of the question markers occur at least
                           once (each marker counts once, case-insensitive).
            drug_mentions  how many of the drug keywords occur at least once
                           (each keyword counts once, case-insensitive).
    """
    if pd.isna(text):
        return {'length': 0, 'word_count': 0, 'qa_pairs': 0, 'drug_mentions': 0}

    raw = str(text)
    lowered = raw.lower()

    # Markers used as a rough proxy for question/answer content.
    question_markers = ('?', 'apakah', 'mengapa', 'bagaimana', 'kapan', 'dimana')
    # Drug-related vocabulary.
    drug_terms = ('narkotika', 'narkoba', 'sabu', 'ganja', 'heroin', 'ekstasi', 'kokain', 'psikotropika')

    return {
        'length': len(raw),
        'word_count': len(raw.split()),
        'qa_pairs': sum(marker in lowered for marker in question_markers),
        'drug_mentions': sum(term in lowered for term in drug_terms),
    }

def extract_bag_of_words(text, top_n=50):
    """Return the most frequent words of a case text with their counts.

    The text is lower-cased; only runs of three or more ASCII letters count
    as words, and a small list of Indonesian stopwords is dropped.

    Args:
        text: Case text; NaN/None yields an empty dict.
        top_n: Maximum number of words to keep.

    Returns:
        dict mapping word -> occurrence count, ordered from most to least
        frequent.
    """
    if pd.isna(text):
        return {}

    ignored = {
        'dan', 'atau', 'yang', 'untuk', 'dari', 'dengan', 'pada', 'dalam',
        'ke', 'di', 'adalah', 'oleh', 'akan', 'telah', 'sudah',
    }
    candidates = re.findall(r'\b[a-z]{3,}\b', str(text).lower())
    frequencies = Counter(token for token in candidates if token not in ignored)
    return dict(frequencies.most_common(top_n))

# Compute per-case text features and bag-of-words counts from the combined
# case text, then attach the scalar features as new columns.
print("Applying feature engineering...")

# One entry per case, in the same order as metadata_df.
text_features_list = [
    calculate_text_features(full_text) for full_text in metadata_df['text_full']
]
bag_of_words_list = [
    extract_bag_of_words(full_text) for full_text in metadata_df['text_full']
]

# Add text features to dataframe
for feature_name in ('length', 'word_count', 'qa_pairs', 'drug_mentions'):
    metadata_df[feature_name] = [
        case_features[feature_name] for case_features in text_features_list
    ]

# =====================================================
# iv. PENYIMPANAN (DRUG CRIME SPECIFIC)
# =====================================================

# Make sure the output folder exists before writing anything into it.
processed_folder = "/content/drive/MyDrive/data/processed"
os.makedirs(processed_folder, exist_ok=True)

# 1. Flat table: one row per case, written without the index column.
csv_output_path = os.path.join(processed_folder, "drug_crime_cases.csv")
metadata_df.to_csv(csv_output_path, index=False)
print(f"CSV saved to: {csv_output_path}")

# 2. Nested JSON: the same fields grouped by topic, plus the bag of words.
json_output_path = os.path.join(processed_folder, "drug_crime_cases.json")
json_data = []

for idx, row in metadata_df.iterrows():
    json_data.append({
        'case_id': row['case_id'],
        'metadata': {
            key: row[key] for key in ('no_perkara', 'tanggal', 'pasal', 'pihak')
        },
        'drug_info': {
            key: row[key]
            for key in ('jenis_narkoba', 'berat_narkoba', 'jenis_tindakan')
        },
        'verdict': {
            key: row[key]
            for key in ('status_putusan', 'jenis_hukuman', 'durasi_hukuman', 'jumlah_denda')
        },
        'content': {
            'ringkasan_fakta': row['ringkasan_fakta'],
            'argumen_hukum': row.get('argumen_hukum', ''),
            # Only the first 500 characters of the full text go into the JSON.
            'text_full': row['text_full'][:500],
        },
        'features': {
            # Cast to built-in int so the values are JSON-serialisable.
            key: int(row[key])
            for key in ('length', 'word_count', 'qa_pairs', 'drug_mentions')
        },
        'bag_of_words': bag_of_words_list[idx] if idx < len(bag_of_words_list) else {},
    })

with open(json_output_path, 'w', encoding='utf-8') as json_file:
    json.dump(json_data, json_file, ensure_ascii=False, indent=2)

print(f"JSON saved to: {json_output_path}")

# =====================================================
# v. OUTPUT: Data kasus narkoba dalam format .csv atau .json
# =====================================================

# Headline numbers for the run.
print("\n=== DRUG CRIME CASE REPRESENTATION SUMMARY ===")
print(f"Total drug crime cases processed: {len(metadata_df)}")
print(f"Average text length: {metadata_df['length'].mean():.0f} characters")
print(f"Average word count: {metadata_df['word_count'].mean():.0f} words")
print(f"Average drug mentions per case: {metadata_df['drug_mentions'].mean():.1f}")

# Preview the first rows, restricted to the columns that actually exist.
print("\n=== SAMPLE OUTPUT (CSV FORMAT) ===")
sample_columns = ['case_id', 'no_perkara', 'tanggal', 'jenis_narkoba', 'jenis_tindakan', 'status_putusan', 'length', 'word_count']
available_sample_columns = [col for col in sample_columns if col in metadata_df.columns]
print(metadata_df[available_sample_columns].head())

# Top value counts for each categorical column that is present.
print("\n=== DRUG CRIME SPECIFIC STATISTICS ===")
distribution_headings = (
    ('jenis_narkoba', "Jenis Narkoba Distribution:"),
    ('jenis_tindakan', "\nJenis Tindakan Distribution:"),
    ('status_putusan', "\nStatus Putusan Distribution:"),
)
for column_name, heading in distribution_headings:
    if column_name not in metadata_df.columns:
        continue
    print(heading)
    print(metadata_df[column_name].value_counts().head())

# List every column written to the CSV.
print("\n=== COLUMN INFORMATION ===")
print("Columns in output CSV:")
for col in metadata_df.columns:
    print(f"- {col}")

print(f"\nFiles saved in: {processed_folder}/")
print("- drug_crime_cases.csv (structured tabular data)")
print("- drug_crime_cases.json (nested JSON format)")

# Create a summary statistics file for drug crimes.
# Each 'pasal' cell is a ", "-joined list, so every entry is stripped before
# de-duplication; otherwise "112" and " 112" would count as two articles.
unique_articles = {
    article.strip()
    for pasal_list in metadata_df['pasal']
    for article in str(pasal_list).split(',')
    if article.strip()
}

summary_stats = {
    'total_cases': len(metadata_df),
    # Cast to built-in float so the values are JSON-serialisable.
    'average_text_length': float(metadata_df['length'].mean()),
    'average_word_count': float(metadata_df['word_count'].mean()),
    'average_drug_mentions': float(metadata_df['drug_mentions'].mean()),
    'unique_articles': len(unique_articles),
    'drug_types': metadata_df['jenis_narkoba'].value_counts().to_dict() if 'jenis_narkoba' in metadata_df.columns else {},
    'verdict_distribution': metadata_df['status_putusan'].value_counts().to_dict() if 'status_putusan' in metadata_df.columns else {},
    'processing_date': today
}

summary_path = os.path.join(processed_folder, "drug_crime_processing_summary.json")
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(summary_stats, f, ensure_ascii=False, indent=2)

print(f"Processing summary saved to: {summary_path}")

# Print a compact, pipe-separated preview of the first (up to) three cases.
print("\n=== FINAL DRUG CRIME DATA STRUCTURE ===")
print("CSV Structure:")
print("case_id | no_perkara | tanggal | jenis_narkoba | jenis_tindakan | status_putusan | jenis_hukuman | ringkasan_fakta | pasal | pihak")
print("--------|------------|---------|---------------|----------------|----------------|---------------|-----------------|-------|-------")

# Columns shown truncated to a fixed width and followed by "...".
truncated_columns = (
    ('jenis_narkoba', 12),
    ('jenis_tindakan', 13),
    ('status_putusan', 13),
    ('jenis_hukuman', 12),
    ('ringkasan_fakta', 12),
    ('pasal', 8),
    ('pihak', 8),
)
for idx in range(min(3, len(metadata_df))):
    row = metadata_df.iloc[idx]
    cells = [
        f"{row['case_id']}",
        f"{row['no_perkara'][:10]}...",
        f"{row['tanggal'][:10]}",
    ]
    cells.extend(
        f"{str(row[column])[:width]}..." for column, width in truncated_columns
    )
    print(' | '.join(cells))

print("\nData kasus narkoba berhasil direpresentasikan dalam struktur terorganisir!")

Loaded data shape: (64, 27)
Extracting metadata...
Metadata extracted for 64 cases
Extracting key content...
Applying feature engineering...
CSV saved to: /content/drive/MyDrive/data/processed/drug_crime_cases.csv
JSON saved to: /content/drive/MyDrive/data/processed/drug_crime_cases.json

=== DRUG CRIME CASE REPRESENTATION SUMMARY ===
Total drug crime cases processed: 64
Average text length: 9156 characters
Average word count: 1243 words
Average drug mentions per case: 0.1

=== SAMPLE OUTPUT (CSV FORMAT) ===
         case_id                     no_perkara           tanggal  \
0  drug_case_001    34/Pid.Sus.Anak/2016/PN.Kla   26 Agustus 2016   
1  drug_case_002          0288/Pdt.G/2016/PA.JU  15 Februari 2016   
2  drug_case_003            160/PDT/2015/PT SBY     30 Maret 2015   
3  drug_case_004  121 / Pid.Sus / 2015 / PN-Mbo                 —   
4  drug_case_005        805/Pid.Sus/2016/PN TBT                 —   

     jenis_narkoba        jenis_tindakan   status_putusan  length  word