In [None]:
from IPython import get_ipython
from IPython.display import display

!pip install pandas requests beautifulsoup4 pdfminer.six lxml > /dev/null 2>&1

# Mount Google Drive: later cells read and write under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# ----SCRAPING----------
import argparse
import io
import os
import re
import threading
import time
import urllib
import urllib.parse
import urllib.request
from concurrent.futures import ThreadPoolExecutor, wait
from datetime import date

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pdfminer import high_level

def create_path(folder_name):
    """Ensure a folder exists under the current working directory.

    Args:
        folder_name (str): Folder name (or relative path) to create.

    Returns:
        str: The absolute path of the folder.
    """
    path = os.path.join(os.getcwd(), folder_name)
    # exist_ok avoids the check-then-create race when several workers ask for
    # the same folder at the same time.
    os.makedirs(path, exist_ok=True)
    return path
def open_page(link, retries=3, delay=5, timeout=30):
    """Download a page and parse it with BeautifulSoup, retrying on failure.

    Args:
        link (str): URL to fetch.
        retries (int): Maximum number of attempts.
        delay (float): Seconds to sleep between failed attempts.
        timeout (float): Per-request timeout in seconds, so a stalled
            connection cannot hang a worker forever.

    Returns:
        BeautifulSoup | None: The parsed page, or None when every attempt
        failed. Callers must handle None.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(link, timeout=timeout)
            return BeautifulSoup(response.text, "lxml")
        except requests.RequestException as error:
            # Only network-level failures are retried; programming errors and
            # Ctrl-C are allowed to propagate.
            print(f"Attempt {attempt}/{retries} failed for {link}: {error}")
            if attempt < retries:
                time.sleep(delay)
    return None

def get_detail(soup, keyword):
    """Return the text of the element that follows the label cell for `keyword`.

    The label cell is the first <td> whose text contains `keyword`
    (substring match), so a short label can also match a longer one that
    appears earlier in the markup.

    Args:
        soup: BeautifulSoup node to search (the caller passes the detail table).
        keyword (str): Label text to look for.

    Returns:
        str: The stripped text of the following element, or "" when the label
        (or its following element) is missing.
    """
    try:
        label_cell = soup.find(lambda tag: tag.name == "td" and keyword in tag.text)
        return label_cell.find_next().get_text().strip()
    except (AttributeError, TypeError):
        # A missing node shows up as None somewhere in the chain. Anything else
        # (including KeyboardInterrupt) is no longer swallowed.
        return ""

def get_pdf(url, path_pdf, timeout=60):
    """Download a PDF, save it under `path_pdf`, and return its bytes.

    Args:
        url (str): Location of the PDF.
        path_pdf (str): Folder the file is written to (created if missing).
        timeout (float): Timeout in seconds passed to urlopen.

    Returns:
        tuple[io.BytesIO, str]: An in-memory copy of the file and the file
        name it was saved under.

    Raises:
        urllib.error.URLError / OSError: When the download or the write fails.
    """
    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        file_name = response.info().get_filename()
        file_content = response.read()

    if not file_name:
        # No Content-Disposition file name: fall back to the last URL segment.
        url_path = urllib.parse.unquote(urllib.parse.urlsplit(url).path)
        file_name = os.path.basename(url_path) or "putusan.pdf"
    file_name = file_name.replace("/", " ")

    os.makedirs(path_pdf, exist_ok=True)
    with open(os.path.join(path_pdf, file_name), "wb") as out_file:
        out_file.write(file_content)
    return io.BytesIO(file_content), file_name
def clean_text(text):
    """Strip the court website's watermark/disclaimer lines from extracted PDF text.

    Each fragment below is removed wherever it occurs, as an exact,
    case-sensitive match that includes its trailing newline.

    Args:
        text (str): Text extracted from a decision PDF.

    Returns:
        str: The text without the boilerplate fragments.
    """
    boilerplate_fragments = (
        "M a h ka m a h A g u n g R e p u blik In d o n esia\n",
        "Disclaimer\n",
        "Kepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\n",
        "pelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\n",
        "Dalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\n",
        "Email : kepaniteraan@mahkamahagung.go.id    Telp : 021-384 3348 (ext.318)\n",
    )
    for fragment in boilerplate_fragments:
        text = text.replace(fragment, "")
    return text

def is_url_already_scraped(url, destination):
    """
    Checks if a URL has already been scraped and saved in the CSV file.

    Only the "link" column is parsed, so the (potentially large) PDF text of
    every stored row is not loaded for each check.

    Args:
        url (str): The URL to check.
        destination (str): The path to the output CSV file, without the
            ".csv" extension.

    Returns:
        bool: True if the URL exists in the CSV, False otherwise (including
        when the file is missing, empty, unparsable, or has no "link" column).
    """
    csv_path = f"{destination}.csv"
    if not os.path.isfile(csv_path):
        return False

    try:
        links = pd.read_csv(csv_path, usecols=["link"])["link"]
    except (pd.errors.EmptyDataError, ValueError):
        # ValueError covers a missing "link" column and parser errors.
        return False
    return bool(url in links.values)

# Serializes every read of / append to the shared output CSV. run_scraper runs
# run_process on several worker threads and each of them ends up here, so the
# "does the file exist yet?" check and the write that follows it must not
# interleave with another thread's write.
_CSV_LOCK = threading.Lock()

# (CSV column, label looked up on the detail page), in output column order.
_DETAIL_FIELDS = (
    ("nomor", "Nomor"),
    ("tingkat_proses", "Tingkat Proses"),
    ("klasifikasi", "Klasifikasi"),
    ("kata_kunci", "Kata Kunci"),
    ("tahun", "Tahun"),
    ("tanggal_register", "Tanggal Register"),
    ("lembaga_peradilan", "Lembaga Peradilan"),
    ("jenis_lembaga_peradilan", "Jenis Lembaga Peradilan"),
    ("hakim_ketua", "Hakim Ketua"),
    ("hakim_anggota", "Hakim Anggota"),
    ("panitera", "Panitera"),
    ("amar", "Amar"),
    ("amar_lainnya", "Amar Lainnya"),
    ("catatan_amar", "Catatan Amar"),
    ("tanggal_musyawarah", "Tanggal Musyawarah"),
    ("tanggal_dibacakan", "Tanggal Dibacakan"),
    ("kaidah", "Kaidah"),
    ("abstrak", "Abstrak"),
)


def extract_data(link, keyword_url, output_dir=None, pdf_dir=None, run_date=None, download_pdf=True):
    """Scrape one decision page and append it as a row to the output CSV.

    The CSV is `<output_dir>/putusan_ma_<keyword>_<run_date>.csv`, where
    <keyword> is empty when `keyword_url` is a URL.

    Args:
        link (str): URL of the decision detail page.
        keyword_url (str): Search keyword or search URL the link came from.
        output_dir (str | None): CSV folder; defaults to the Drive CSV folder.
        pdf_dir (str | None): PDF folder; defaults to the Drive PDF folder.
        run_date (str | None): Date stamp for the file name; defaults to today.
        download_pdf (bool): When False, the PDF is neither downloaded nor
            parsed and the PDF columns are left empty.

    Returns:
        None. Pages that cannot be loaded or lack the detail table are
        reported and skipped instead of raising.
    """
    # The resolved settings are still published as module globals, as before,
    # in case other cells read them. The function itself uses the locals only.
    global today
    global path_output
    global path_pdf

    out_dir = output_dir or '/content/drive/MyDrive/CSV'
    pdf_target = pdf_dir or '/content/drive/MyDrive/PDF'
    stamp = run_date or date.today().strftime("%Y-%m-%d")
    path_output, path_pdf, today = out_dir, pdf_target, stamp

    keyword_url = keyword_url.replace("/", " ")
    if keyword_url.startswith("https"):
        keyword_url = ""
    destination = f"{out_dir}/putusan_ma_{keyword_url}_{stamp}"
    csv_path = f"{destination}.csv"

    # Read under the lock so a half-written row from another thread is never parsed.
    with _CSV_LOCK:
        if is_url_already_scraped(link, destination):
            print(f"Skipping duplicate URL: {link}")
            return

    soup = open_page(link)
    table = soup.find("table", {"class": "table"}) if soup is not None else None
    if table is None:
        print(f"Skipping {link}: page could not be loaded or has no detail table")
        return

    # The title is removed from the table before the label lookups below run,
    # so its text cannot match a label.
    heading = table.find("h2")
    judul = heading.text if heading is not None else ""
    if heading is not None:
        heading.decompose()

    record = {"judul": judul}
    for column, label in _DETAIL_FIELDS:
        record[column] = get_detail(table, label)

    link_pdf = text_pdf = file_name_pdf = ""
    if download_pdf:
        try:
            pdf_anchor = soup.find("a", href=re.compile(r"/pdf/"))
            if pdf_anchor is not None:
                link_pdf = pdf_anchor["href"]
                file_pdf, file_name_pdf = get_pdf(link_pdf, pdf_target)
                text_pdf = clean_text(high_level.extract_text(file_pdf))
        except Exception as error:
            # Best effort: a broken PDF must not lose the page metadata.
            print(f"PDF unavailable for {link}: {error!r}")
            link_pdf = text_pdf = file_name_pdf = ""

    record.update(
        link=link,
        link_pdf=link_pdf,
        file_name_pdf=file_name_pdf,
        text_pdf=text_pdf,
    )
    result = pd.DataFrame([record], columns=list(record))

    print(destination)
    os.makedirs(out_dir, exist_ok=True)
    with _CSV_LOCK:
        # The header is written only by whichever call creates the file.
        write_header = not os.path.isfile(csv_path)
        result.to_csv(csv_path, mode="a", header=write_header, index=False)

def run_scraper(keyword=None, url=None, sort_date=True, download_pdf=True):
    """Scrape the first result pages of a search, one worker task per page.

    The cap is applied per page: 50 entries at 20 per page rounds up to
    3 pages, so every link found on those pages is processed.

    Args:
        keyword (str | None): Search keyword, used when `url` is not given.
        url (str | None): Full search URL; takes precedence over `keyword`.
        sort_date (bool): Passed to run_process to request date-sorted results.
        download_pdf (bool): Accepted for compatibility; this function does
            not read it.

    Returns:
        None. Failures of individual pages are printed, not raised.
    """
    if not keyword and not url:
        print("Please provide a keyword or URL")
        return

    # Limit total data to 50 entries and calculate max page accordingly
    max_entries = 50
    entries_per_page = 20
    max_pages = (max_entries + entries_per_page - 1) // entries_per_page  # ceiling division

    if url:
        link = url
    else:
        link = f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword}&page=1"

    soup = open_page(link)
    if soup is None:
        print(f"Could not load search results: {link}")
        return

    # The last pagination link carries the final page number. A result set
    # without pagination links is treated as a single page.
    pagination_links = soup.find_all("a", {"class": "page-link"})
    try:
        last_page = int(pagination_links[-1].get("data-ci-pagination-page"))
    except (IndexError, TypeError, ValueError):
        last_page = 1

    # Make sure last_page does not exceed max_pages
    last_page = max(1, min(last_page, max_pages))

    if url:
        print(f"Scraping with url: {url} - Up to {max_entries} data - {last_page} page(s)")
    else:
        print(f"Scraping with keyword: {keyword} - Up to {max_entries} data - {last_page} page(s)")

    keyword_url = url if url else keyword

    # Leaving the `with` block waits for every submitted task to finish.
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = {
            executor.submit(run_process, keyword_url, page, sort_date): page
            for page in range(1, last_page + 1)
        }

    # Exceptions raised inside workers are stored on the future; report them
    # so a failed page does not go unnoticed.
    for future, page in futures.items():
        error = future.exception()
        if error is not None:
            print(f"Page {page} failed: {error!r}")


def run_process(keyword_url, page, sort_page):
    """Scrape every decision linked from one search-result page.

    Args:
        keyword_url (str): Search keyword, or a full search URL starting with
            "https".
        page (int): 1-based result page number.
        sort_page (bool): When True, append the sort-by-decision-date
            (descending) query parameters.

    Returns:
        None.
    """
    if keyword_url.startswith("https"):
        # Use "?" when the URL has no query string yet, otherwise "&".
        separator = "&" if "?" in keyword_url else "?"
        page_url = f"{keyword_url}{separator}page={page}"
    else:
        page_url = f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword_url}&page={page}"
    if sort_page:
        page_url = f"{page_url}&obf=TANGGAL_PUTUS&obm=desc"

    print(page_url)

    soup = open_page(page_url)
    if soup is None:
        print(f"Could not load result page: {page_url}")
        return

    anchors = soup.find_all("a", {"href": re.compile("/direktori/putusan")})
    # dict.fromkeys drops repeated hrefs while keeping first-seen order, so
    # the same decision is not processed twice per page.
    for detail_url in dict.fromkeys(anchor["href"] for anchor in anchors):
        extract_data(detail_url, keyword_url)

def scrape_specific_url(url, download_pdf=True):
    """Scrape a single decision page given its full URL.

    Args:
        url (str): Detail-page URL; must start with "https://".
        download_pdf (bool): Accepted for compatibility; this function does
            not read it.

    Returns:
        None.
    """
    if not url or not url.startswith("https://"):
        print("Please provide a valid URL")
        return

    # extract_data takes (link, keyword_url) and resolves the output folders
    # and date itself; passing them positionally raised a TypeError.
    extract_data(url, url)

# Start a scrape from a ready-made search URL. The query string is
# "Tindak pidana penyalahgunaan dan peredaran narkoba".
run_scraper(url="https://putusan3.mahkamahagung.go.id/search?q=Tindak+pidana+penyalahgunaan+dan+peredaran+narkoba")

Scraping with url: https://putusan3.mahkamahagung.go.id/search?q=Tindak+pidana+penyalahgunaan+dan+peredaran+narkoba - Up to 50 data - 3 page(s)
https://putusan3.mahkamahagung.go.id/search?q=Tindak+pidana+penyalahgunaan+dan+peredaran+narkoba&page=1&obf=TANGGAL_PUTUS&obm=desc
https://putusan3.mahkamahagung.go.id/search?q=Tindak+pidana+penyalahgunaan+dan+peredaran+narkoba&page=2&obf=TANGGAL_PUTUS&obm=desc
https://putusan3.mahkamahagung.go.id/search?q=Tindak+pidana+penyalahgunaan+dan+peredaran+narkoba&page=3&obf=TANGGAL_PUTUS&obm=desc
/content/drive/MyDrive/CSV/putusan_ma__2025-06-20
/content/drive/MyDrive/CSV/putusan_ma__2025-06-20
/content/drive/MyDrive/CSV/putusan_ma__2025-06-20
/content/drive/MyDrive/CSV/putusan_ma__2025-06-20
/content/drive/MyDrive/CSV/putusan_ma__2025-06-20
/content/drive/MyDrive/CSV/putusan_ma__2025-06-20
/content/drive/MyDrive/CSV/putusan_ma__2025-06-20
/content/drive/MyDrive/CSV/putusan_ma__2025-06-20
/content/drive/MyDrive/CSV/putusan_ma__2025-06-20
/content/driv

In [None]:
# -----PREPROCESSING FOR DRUG CRIME CASES------------
# Mount Google Drive again so this cell can also be run on its own; the CSV
# folder read below lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import re
import os
from datetime import date

# Load the scraped data
csv_dir = '/content/drive/MyDrive/CSV'
# Only the scraper's own output (putusan_ma_*.csv) is a candidate. The
# preprocessed result is saved into this same folder further down, so picking
# "the newest .csv" would re-process that file on a second run.
csv_files = [
    f for f in os.listdir(csv_dir)
    if f.endswith('.csv') and f.startswith('putusan_ma_')
]
if csv_files:
    # Use the most recent CSV file
    latest_file = max(csv_files, key=lambda x: os.path.getctime(os.path.join(csv_dir, x)))
    df = pd.read_csv(os.path.join(csv_dir, latest_file))
    print(f"Loaded data from: {latest_file}")
else:
    print("No CSV files found in the CSV directory")
    df = pd.DataFrame()

print(f"Original data shape: {df.shape}")
print(df.head())

# 1. Remove duplicates (rows identical in every column)
df = df.drop_duplicates()
print(f"After removing duplicates: {df.shape}")

# 2. Drop unnecessary columns
columns_to_drop = [
    'link', 'link_pdf', 'file_name_pdf'  # Keep text_pdf as it might be needed
]
# Only drop columns that exist in the dataframe
existing_columns_to_drop = [col for col in columns_to_drop if col in df.columns]
df = df.drop(columns=existing_columns_to_drop)
print(f"Dropped columns: {existing_columns_to_drop}")

# 3. Enhanced text preprocessing function specifically for legal documents
def preprocess_text(text):
    """
    Comprehensive text preprocessing for legal documents.

    Lowercases the text, removes the court website's watermark/disclaimer
    boilerplate, removes the word "mengadili" (also when letter-spaced),
    strips characters other than word characters, whitespace and . , ; : ! ? ( ) -,
    removes "halaman N" / "page N" markers and collapses whitespace.

    Args:
        text: Raw cell value; missing values (None/NaN) are accepted.

    Returns:
        str: The cleaned, lowercased, single-spaced text ("" for missing input).
    """
    if pd.isna(text):
        return ""

    # Convert to string and lowercase
    text = str(text).lower()

    # Lowercased copies of the boilerplate lines removed by the scraper's
    # clean_text; each is matched exactly, including its trailing newline.
    boilerplate_lines = (
        "m a h ka m a h a g u n g r e p u blik in d o n esia\n",
        "disclaimer\n",
        "kepaniteraan mahkamah agung republik indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen mahkamah agung untuk pelayanan publik, transparansi dan akuntabilitas\n",
        "pelaksanaan fungsi peradilan. namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\n",
        "dalam hal anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi kepaniteraan mahkamah agung ri melalui :\n",
        "email : kepaniteraan@mahkamahagung.go.id    telp : 021-384 3348 (ext.318)\n",
    )
    for line in boilerplate_lines:
        text = text.replace(line, "")

    # Looser regex fallbacks for boilerplate that did not match exactly
    text = re.sub(r'm\s*a\s*h\s*k\s*a\s*m\s*a\s*h\s*\s*a\s*g\s*u\s*n\s*g.*?indonesia', '', text, flags=re.IGNORECASE)
    text = re.sub(r'disclaimer.*?waktu', '', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'kepaniteraan.*?go\.id', '', text, flags=re.IGNORECASE)

    # Remove "MENGADILI" variations
    text = re.sub(r'm\s*e\s*n\s*g\s*a\s*d\s*i\s*l\s*i\s*:?', '', text)

    # Remove excessive whitespace and normalize
    text = re.sub(r'\s+', ' ', text)

    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s\.,;:!?()-]', '', text)

    # Remove watermarks and page numbers
    text = re.sub(r'halaman\s*\d+', '', text)
    text = re.sub(r'page\s*\d+', '', text)

    # The removals above can leave two spaces where a marker used to be;
    # collapse once more so the result is single-spaced.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# 4. Clean every free-text column that is present in the frame
text_columns = ['judul', 'amar', 'amar_lainnya', 'catatan_amar', 'kaidah', 'abstrak', 'text_pdf']
for col in text_columns:
    if col not in df.columns:
        continue  # column absent from this export; nothing to clean
    df[col] = df[col].apply(preprocess_text)
    print(f"Preprocessed column: {col}")

# 5. Drug-specific information extraction functions
def extract_drug_type(text):
    """
    List the drug categories mentioned in a text.

    Matching is a case-insensitive substring search per category.

    Returns:
        str: Comma-separated category names in the fixed order below,
        'tidak diketahui' when none matches, or "" for missing input.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()

    # (category name, regex of terms that indicate it)
    drug_patterns = (
        ('narkotika', r'narkotika|narkoba'),
        ('psikotropika', r'psikotropika'),
        ('sabu', r'sabu|metamfetamin|methamphetamine'),
        ('ganja', r'ganja|marihuana|marijuana|cannabis'),
        ('heroin', r'heroin'),
        ('kokain', r'kokain|cocaine'),
        ('ekstasi', r'ekstasi|ecstasy|mdma'),
        ('tramadol', r'tramadol'),
        ('pil_koplo', r'pil koplo|carnophen'),
        ('prekursor', r'prekursor'),
    )

    detected = [name for name, pattern in drug_patterns if re.search(pattern, lowered)]
    if not detected:
        return 'tidak diketahui'
    return ', '.join(detected)

def extract_drug_weight(text):
    """
    Extract a drug weight/amount ("<number> <unit>") from legal text.

    Units are tried in the order gram, kg, kilogram, ons, ton; the first unit
    with a match wins and its left-most occurrence is returned. Numbers may
    use "." or "," as separators (e.g. "0,25" or "1.234,5"), and the unit
    must end at a word boundary so words such as "tonggak" do not count.

    Returns:
        str: The matched text (number and unit), or "" when nothing matches
        or the input is missing.
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()

    # Digits, optionally followed by "."/"," separated digit groups
    number = r'\d+(?:[.,]\d+)*'
    units = ('gram', 'kg', 'kilogram', 'ons', 'ton')

    for unit in units:
        match = re.search(number + r'\s*' + unit + r'\b', text)
        if match:
            return match.group(0)

    return ""

def extract_drug_action(text):
    """
    List the kinds of drug-related acts mentioned in a text.

    Matching is a case-insensitive substring search per act.

    Returns:
        str: Comma-separated act names in the fixed order below,
        'tidak diketahui' when none matches, or "" for missing input.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()

    # (act name, regex of terms that indicate it)
    action_patterns = (
        ('mengedarkan', r'mengedarkan|mengedar|peredaran'),
        ('menjual', r'menjual|jual|perdagangan'),
        ('membeli', r'membeli|beli'),
        ('menyimpan', r'menyimpan|simpan|kepemilikan'),
        ('menggunakan', r'menggunakan|gunakan|pakai|penyalahgunaan'),
        ('memproduksi', r'memproduksi|produksi|membuat'),
        ('mengimpor', r'mengimpor|impor'),
        ('mengekspor', r'mengekspor|ekspor'),
        ('mentransportasi', r'mentransportasi|transport|mengangkut'),
    )

    detected = [name for name, pattern in action_patterns if re.search(pattern, lowered)]
    if not detected:
        return 'tidak diketahui'
    return ', '.join(detected)

def extract_verdict_status(text):
    """
    Classify the verdict stated in a legal text.

    Patterns are tried in order and the first match wins. "tidak bersalah"
    is tested before "bersalah" because the "bersalah" pattern also matches
    any "tidak bersalah" text.

    Returns:
        str: "tidak bersalah", "bersalah", "bebas", "lepas",
        "tidak diketahui" when nothing matches, or "" for missing input.
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()

    # (status, pattern), most specific first
    verdict_patterns = (
        ("tidak bersalah", r'menyatakan.*?tidak\s*bersalah'),
        ("bersalah", r'menyatakan.*?bersalah'),
        ("bebas", r'menyatakan.*?bebas'),
        ("lepas", r'menyatakan.*?lepas'),
    )

    for status, pattern in verdict_patterns:
        if re.search(pattern, text):
            return status
    return "tidak diketahui"

def extract_punishment_type(text):
    """
    List the kinds of punishment mentioned in a text.

    Returns:
        str: Comma-separated labels in the fixed order below,
        "tidak diketahui" when none matches, or "" for missing input.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()

    # (label reported, pattern that triggers it)
    punishment_patterns = (
        ("pidana penjara", r'pidana\s*penjara'),
        ("pidana denda", r'pidana\s*denda'),
        ("pidana percobaan", r'pidana\s*percobaan'),
        ("pidana bersyarat", r'pidana\s*bersyarat'),
        ("rehabilitasi", r'rehabilitasi'),
    )

    found = [label for label, pattern in punishment_patterns if re.search(pattern, lowered)]
    if found:
        return ', '.join(found)
    return "tidak diketahui"

def extract_sentence_duration(text):
    """
    Extract a sentence duration from legal text.

    A number may be followed by its spelled-out form in parentheses, and
    years/months may be joined by "dan", e.g. "5 (lima) tahun dan 6 (enam) bulan".
    A duration that directly follows "penjara [selama]" is preferred, so an
    unrelated duration earlier in the text (such as an age) is not returned
    when a prison term is stated. Otherwise the first matching pattern in the
    order years+months, years, months, days is used.

    Returns:
        str: The matched duration text, or "" when nothing matches or the
        input is missing.
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()

    # Digits, optionally followed by a short parenthesised word form: "5 (lima)"
    number = r'\d+(?:\s*\([a-z\s]{1,40}\))?'
    duration_patterns = [
        number + r'\s*tahun\s*(?:dan\s*)?' + number + r'\s*bulan',
        number + r'\s*tahun',
        number + r'\s*bulan',
        number + r'\s*hari',
    ]

    # Heuristic: the duration stated right after "penjara" is taken as the
    # prison term.
    prison_term = re.search(
        r'penjara\s+(?:selama\s+)?(' + '|'.join(duration_patterns) + r')', text
    )
    if prison_term:
        return prison_term.group(1)

    for pattern in duration_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(0)

    return ""

def extract_fine_amount(text):
    """
    Extract a rupiah amount from legal text.

    An amount that follows the word "denda" is preferred; otherwise the
    first "rp" amount in the text is used.

    Returns:
        str: The amount without the "rp" prefix (e.g. "1.000.000" or
        "5 juta"), or "" when nothing matches or the input is missing.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()

    fine_patterns = (
        r'denda.*?rp\.?\s*(\d+(?:\.\d+)*(?:\s*(?:juta|ribu|miliar))?)',
        r'rp\.?\s*(\d+(?:\.\d+)*(?:\s*(?:juta|ribu|miliar))?)',
    )

    # Lazily try each pattern and stop at the first one that matches.
    attempts = (re.search(pattern, lowered) for pattern in fine_patterns)
    first_hit = next((hit for hit in attempts if hit), None)
    return first_hit.group(1) if first_hit else ""

# 6. Apply drug-specific extraction functions
combined_text_columns = ['amar', 'catatan_amar', 'text_pdf', 'kaidah', 'abstrak']
# Use only the text columns that are actually present, so a missing column
# does not raise a KeyError.
available_text_columns = [col for col in combined_text_columns if col in df.columns]

# Create combined text for better extraction
if available_text_columns:
    df['combined_text'] = df[available_text_columns].fillna('').apply(
        lambda x: ' '.join(x.astype(str)), axis=1
    )
else:
    df['combined_text'] = ''

# Apply extraction functions
df['jenis_narkoba'] = df['combined_text'].apply(extract_drug_type)
df['berat_narkoba'] = df['combined_text'].apply(extract_drug_weight)
df['jenis_tindakan'] = df['combined_text'].apply(extract_drug_action)
df['status_putusan'] = df['combined_text'].apply(extract_verdict_status)
df['jenis_hukuman'] = df['combined_text'].apply(extract_punishment_type)
df['durasi_hukuman'] = df['combined_text'].apply(extract_sentence_duration)
df['jumlah_denda'] = df['combined_text'].apply(extract_fine_amount)

# Drop the temporary combined_text column
df = df.drop(columns=['combined_text'])

# 7. Clean and standardize date columns
date_columns = ['tanggal_register', 'tanggal_musyawarah', 'tanggal_dibacakan']
for col in date_columns:
    if col in df.columns:
        # Basic date cleaning - remove extra spaces and standardize format.
        # astype(str) turns NaN into the literal "nan", so missing values are
        # restored afterwards.
        cleaned = df[col].astype(str).str.replace(r'\s+', ' ', regex=True).str.strip()
        df[col] = cleaned.where(df[col].notna())

# 8. Clean and standardize other categorical columns
categorical_columns = ['tingkat_proses', 'klasifikasi', 'lembaga_peradilan', 'jenis_lembaga_peradilan']
for col in categorical_columns:
    if col in df.columns:
        cleaned = df[col].astype(str).str.lower().str.strip()
        df[col] = cleaned.where(df[col].notna())  # keep NaN as NaN, not "nan"

# 9. Handle missing values with drug-specific defaults
default_values = {
    'kata_kunci': 'narkoba',
    'hakim_ketua': 'tidak diketahui',
    'hakim_anggota': 'tidak diketahui',
    'panitera': 'tidak diketahui',
    'status_putusan': 'tidak diketahui',
    'jenis_hukuman': 'tidak diketahui',
    'durasi_hukuman': 'tidak diketahui',
    'jumlah_denda': 'tidak diketahui',
    'jenis_narkoba': 'tidak diketahui',
    'berat_narkoba': 'tidak diketahui',
    'jenis_tindakan': 'tidak diketahui'
}
# The extraction functions report "nothing found" as an empty string, which
# fillna does not treat as missing; turn those into NaN first so the defaults
# above apply to them too.
columns_with_defaults = [col for col in default_values if col in df.columns]
df[columns_with_defaults] = df[columns_with_defaults].mask(df[columns_with_defaults].eq(''))
df = df.fillna(default_values)

# 10. Data validation and cleaning
def validate_year(year, min_year=1900, max_year=None):
    """Validate and clean year data.

    Args:
        year: Raw value (number or numeric string such as "2016" or "2016.0").
        min_year (int): Smallest accepted year.
        max_year (int | None): Largest accepted year; defaults to the current
            calendar year instead of a fixed constant.

    Returns:
        int | None: The year as an integer, or None when the value is
        missing, not numeric, or outside [min_year, max_year].
    """
    if pd.isna(year):
        return None
    if max_year is None:
        max_year = date.today().year
    try:
        year_int = int(float(str(year)))
    except (ValueError, OverflowError):
        # Non-numeric text, "nan", or infinity
        return None
    return year_int if min_year <= year_int <= max_year else None

if 'tahun' in df.columns:
    # Nullable Int64 keeps years as integers even when some rows are invalid;
    # a plain apply would turn the column into floats (2016.0) once a None appears.
    df['tahun'] = df['tahun'].apply(validate_year).astype('Int64')

# 11. Create summary statistics for drug crime cases
print("\n=== DRUG CRIME PREPROCESSING SUMMARY ===")
print(f"Final data shape: {df.shape}")
print(f"Total missing values: {df.isnull().sum().sum()}")

# One value-count table per extracted column
distribution_reports = (
    ("Jenis Narkoba", 'jenis_narkoba'),
    ("Jenis Tindakan", 'jenis_tindakan'),
    ("Status Putusan", 'status_putusan'),
    ("Jenis Hukuman", 'jenis_hukuman'),
)
for report_title, report_column in distribution_reports:
    print(f"\n{report_title} Distribution:")
    print(df[report_column].value_counts())

# 12. Save preprocessed data
today = date.today().strftime("%Y-%m-%d")
output_path = f"/content/drive/MyDrive/CSV/preprocessed_drug_crime_cases_{today}.csv"
df.to_csv(output_path, index=False)
print(f"\nPreprocessed data saved to: {output_path}")

# 13. Write each non-empty text_pdf to its own .txt file, plus one combined file
if 'text_pdf' in df.columns:
    raw_folder = "/content/drive/MyDrive/data/raw"
    os.makedirs(raw_folder, exist_ok=True)

    # (row position, text) for every row whose text_pdf is non-null and non-blank
    valid_documents = [
        (position, content)
        for position, content in enumerate(df['text_pdf'])
        if pd.notna(content) and str(content).strip()
    ]

    # Individual files are numbered by their rank among the valid documents
    valid_text_count = 0
    for valid_text_count, (position, content) in enumerate(valid_documents, start=1):
        case_filename = f"drug_case_{valid_text_count:03d}.txt"
        case_filepath = os.path.join(raw_folder, case_filename)
        row = df.iloc[position]

        with open(case_filepath, 'w', encoding='utf-8') as case_file:
            # Metadata header
            case_file.write(f"=== DRUG CRIME CASE {valid_text_count:03d} ===\n")
            case_file.write(f"Nomor: {row.get('nomor', 'N/A')}\n")
            case_file.write(f"Tahun: {row.get('tahun', 'N/A')}\n")
            case_file.write(f"Klasifikasi: {row.get('klasifikasi', 'N/A')}\n")
            case_file.write(f"Jenis Narkoba: {row.get('jenis_narkoba', 'N/A')}\n")
            case_file.write(f"Jenis Tindakan: {row.get('jenis_tindakan', 'N/A')}\n")
            case_file.write(f"Status Putusan: {row.get('status_putusan', 'N/A')}\n")
            case_file.write("=" * 50 + "\n\n")

            # Cleaned document text
            case_file.write(content)

    print(f"Raw text data saved to {raw_folder}/")
    print(f"Created {valid_text_count} individual case files (drug_case_001.txt to drug_case_{valid_text_count:03d}.txt)")

    # Combined file; documents here are labelled with their 1-based row position
    combined_filepath = os.path.join(raw_folder, f"all_drug_cases_combined_{today}.txt")
    with open(combined_filepath, 'w', encoding='utf-8') as combined_file:
        combined_file.write("=== KUMPULAN KASUS TINDAK PIDANA NARKOBA ===\n")
        combined_file.write(f"Total Kasus: {len(valid_documents)}\n")
        combined_file.write(f"Tanggal Preprocessing: {today}\n")
        combined_file.write("=" * 80 + "\n\n")

        for position, content in valid_documents:
            combined_file.write(f"=== DOKUMEN {position+1} ===\n")
            combined_file.write(content)
            combined_file.write(f"\n{'='*80}\n\n")

    print(f"Combined drug crime cases file saved as: {combined_filepath}")

# Display final sample with drug-specific columns
print("\n=== SAMPLE OF PREPROCESSED DRUG CRIME DATA ===")
drug_specific_columns = ['nomor', 'tahun', 'jenis_narkoba', 'jenis_tindakan', 'status_putusan', 'jenis_hukuman', 'durasi_hukuman', 'berat_narkoba']
available_columns = [col for col in drug_specific_columns if col in df.columns]
print(df[available_columns].head())

# Display column information
print("\n=== COLUMN INFORMATION ===")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line.
df.info()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded data from: putusan_ma__2025-06-20.csv
Original data shape: (64, 23)
                                               judul  \
0  Putusan PN KALIANDA Nomor 34/Pid.Sus.Anak/2016...   
1  Putusan PA JAKARTA UTARA Nomor 0288/Pdt.G/2016...   
2  Putusan PT SURABAYA Nomor 160/PDT/2015/PT SBY ...   
3  Putusan PN MEULABOH Nomor 121 / Pid.Sus / 2015...   
4  Putusan PN TEBING TINGGI Nomor 805/Pid.Sus/201...   

                           nomor tingkat_proses  \
0    34/Pid.Sus.Anak/2016/PN.Kla        Pertama   
1          0288/Pdt.G/2016/PA.JU        Pertama   
2            160/PDT/2015/PT SBY        Banding   
3  121 / Pid.Sus / 2015 / PN-Mbo        Pertama   
4        805/Pid.Sus/2016/PN TBT        Pertama   

                                 klasifikasi  \
0                        Pidana Khusus  Anak   
1                  Perdata Agama  Perceraian   
2       