In [None]:
# Klasifikasi Prospektus Obligasi - Machine Learning Approach

## Solusi untuk Circular Logic Problem

**Masalah Original:**
- Y (label kelas) ditentukan dari X (keyword counts)
- ML hanya mempelajari aturan deterministik yang sudah ada

**Solusi:**
1. **Extract fitur tambahan** dari PDF (bukan hanya keyword counts)
2. **Gunakan clustering** untuk validasi natural grouping
3. **Rekayasa fitur (feature engineering)** yang lebih kaya untuk menemukan pola tersembunyi
4. **Evaluasi dengan proper metrics**

Mounted at /content/drive


In [None]:
%pip install PyPDF2 pandas scikit-learn openpyxl

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m232.6/232.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [1]:
# Import Libraries
import os
import re
import PyPDF2
import pandas as pd
import numpy as np
from collections import Counter

# ML Libraries
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import (classification_report, confusion_matrix, 
                             accuracy_score, f1_score, silhouette_score)
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

ModuleNotFoundError: No module named 'PyPDF2'

In [None]:
# =====================================================
# FEATURE EXTRACTION - Lebih Kaya dari Keyword Count
# =====================================================

def _count_phrases(text, phrases):
    """Return the number of non-overlapping occurrences of any phrase in ``text``.

    Longer phrases are tried first, so a phrase that contains a shorter one
    (e.g. "efek bersifat utang berwawasan lingkungan" contains
    "berwawasan lingkungan") is counted once instead of twice.
    """
    ordered = sorted((phrase.lower() for phrase in phrases), key=len, reverse=True)
    if not ordered:
        return 0
    pattern = "|".join(re.escape(phrase) for phrase in ordered)
    return len(re.findall(pattern, text))


def extract_features_from_pdf(pdf_path):
    """
    Extract keyword and document-level features from one PDF.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to read.

    Returns
    -------
    dict
        One flat record with these groups of keys:
        - 'filename': base name of ``pdf_path``
        - class keyword counts (green, sustainability, sustainability-linked)
        - document stats (pages, total words, average words per page)
        - additional single-term / phrase counts
        - class keyword densities per 1000 words (left at 0 when no words
          were extracted)

    Notes
    -----
    Phrase matching runs on a normalized copy of the text in which runs of
    whitespace and hyphens/dashes are collapsed to a single space. This lets a
    phrase broken by a line break ("green\\nbond") or written with a hyphen
    ("sustainability-linked bond") match its keyword. Word statistics are
    still computed from the raw extracted text.

    Any exception while reading is reported with ``print`` and not re-raised;
    the record is returned with whatever was filled in before the failure
    (all zeros when the file cannot be opened).
    """
    features = {
        'filename': os.path.basename(pdf_path),
        # Keyword counts
        'green_bond_count': 0,
        'sustainability_bond_count': 0,
        'sustainability_linked_count': 0,
        # Document stats
        'total_pages': 0,
        'total_words': 0,
        'avg_words_per_page': 0,
        # Additional keywords
        'lingkungan_count': 0,
        'berkelanjutan_count': 0,
        'sosial_count': 0,
        'emisi_count': 0,
        'karbon_count': 0,
        'energi_terbarukan_count': 0,
        'pencapaian_count': 0,
        'indikator_kinerja_count': 0,
        # Keyword densities (per 1000 words)
        'green_density': 0,
        'sustainability_density': 0,
        'sustainability_linked_density': 0,
    }

    # Keywords for each class
    green_keywords = ["green bond", "green sukuk", "kubl",
                      "berwawasan lingkungan", "efek bersifat utang berwawasan lingkungan"]
    sustainability_keywords = ["sustainability bond", "sustainability sukuk",
                               "ebus keberlanjutan", "efek bersifat utang keberlanjutan"]
    sustainability_linked_keywords = ["sustainability linked bond", "sustainability linked sukuk",
                                      "ebus terkait keberlanjutan", "efek bersifat utang terkait keberlanjutan",
                                      "indikator kinerja utama keberlanjutan"]

    # Additional terms: feature key -> term counted in the text
    additional_terms = {
        'lingkungan_count': 'lingkungan',
        'berkelanjutan_count': 'berkelanjutan',
        'sosial_count': 'sosial',
        'emisi_count': 'emisi',
        'karbon_count': 'karbon',
        'energi_terbarukan_count': 'energi terbarukan',
        'pencapaian_count': 'pencapaian',
        'indikator_kinerja_count': 'indikator kinerja',
    }

    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            features['total_pages'] = len(pdf_reader.pages)

            # extract_text() may yield nothing for a page; treat that as empty text
            page_texts = [(page.extract_text() or "").lower() for page in pdf_reader.pages]
            full_text = " ".join(page_texts)

            # Word statistics come from the raw (un-normalized) text
            features['total_words'] = len(full_text.split())
            features['avg_words_per_page'] = features['total_words'] / max(features['total_pages'], 1)

            # Collapse whitespace and hyphen/dash runs so multi-word phrases match
            # across line breaks and hyphenated spellings
            match_text = re.sub(r"[\s\-\u2010-\u2015]+", " ", full_text)

            # Count class keywords (each text position is counted at most once per class)
            features['green_bond_count'] = _count_phrases(match_text, green_keywords)
            features['sustainability_bond_count'] = _count_phrases(match_text, sustainability_keywords)
            features['sustainability_linked_count'] = _count_phrases(match_text, sustainability_linked_keywords)

            # Additional keywords (plain substring counts)
            for feature_name, term in additional_terms.items():
                features[feature_name] = match_text.count(term)

            # Calculate densities (per 1000 words)
            if features['total_words'] > 0:
                features['green_density'] = (features['green_bond_count'] / features['total_words']) * 1000
                features['sustainability_density'] = (features['sustainability_bond_count'] / features['total_words']) * 1000
                features['sustainability_linked_density'] = (features['sustainability_linked_count'] / features['total_words']) * 1000

    except Exception as e:
        # Best effort: one unreadable PDF must not stop the whole batch
        print(f"Error processing {pdf_path}: {e}")

    return features

print("Feature extraction function defined!")

In [None]:
# =====================================================
# PROCESS ALL PROSPEKTUS FILES
# =====================================================

# Directory containing the prospectus PDFs. The PROSPEKTUS_DIR environment
# variable, when set, overrides the default so the notebook can run on a
# machine where this absolute path does not exist.
prospektus_dir = os.environ.get(
    "PROSPEKTUS_DIR",
    r"d:\1. Important\Work\Bank Indonesia\DSta-DSMF\Web Scraping\Prospektus",
)

# Extract features from all PDFs.
# - sorted(): os.listdir() returns entries in arbitrary order; sorting keeps
#   the row order of `df` reproducible between runs.
# - lower(): also accept files whose extension is written ".PDF".
all_features = []
for filename in sorted(os.listdir(prospektus_dir)):
    if filename.lower().endswith('.pdf'):
        pdf_path = os.path.join(prospektus_dir, filename)
        print(f"Processing: {filename}")
        features = extract_features_from_pdf(pdf_path)
        all_features.append(features)

# Create DataFrame: one row per document, one column per extracted feature
df = pd.DataFrame(all_features)
print(f"\n‚úÖ Processed {len(df)} documents")
print(f"Features extracted: {len(df.columns)} columns")
df.head(10)

In [None]:
# =====================================================
# RULE-BASED LABELING (Sebagai Initial Label)
# =====================================================

def assign_label(row):
    """
    Derive the INITIAL rule-based label of one document from its keyword counts.

    The class with the largest keyword count wins. When counts are equal the
    priority is sustainability_linked > sustainability > green. A document
    whose largest count is not positive is labelled 'obligasi_biasa'.

    `row` is any mapping (e.g. a DataFrame row) exposing 'green_bond_count',
    'sustainability_bond_count' and 'sustainability_linked_count'.
    This label is only a starting point that is later checked with clustering.
    """
    # Candidates are listed from highest to lowest priority. max() returns the
    # first of several equal maxima, so ties go to the higher-priority class.
    candidates = [
        (row['sustainability_linked_count'], 'sustainability_linked_bond'),
        (row['sustainability_bond_count'], 'sustainability_bond'),
        (row['green_bond_count'], 'green_bond'),
    ]
    top_count, top_label = max(candidates, key=lambda pair: pair[0])
    return top_label if top_count > 0 else 'obligasi_biasa'

# Label every document, row by row, with the rule-based classifier
df['initial_label'] = df.apply(assign_label, axis=1)

# Show how many documents fall into each initial class
label_counts = df['initial_label'].value_counts()
print("Label distribution:")
print(label_counts)
print("\n" + "=" * 50)

In [None]:
# =====================================================
# APPROACH 1: CLUSTERING untuk Validasi Natural Grouping
# =====================================================
# Clustering does NOT need labels - it is a VALID way to use ML here
# because we look for natural patterns in the data.

# Select numeric features for clustering
feature_cols = ['total_pages', 'total_words', 'avg_words_per_page',
                'green_bond_count', 'sustainability_bond_count', 'sustainability_linked_count',
                'lingkungan_count', 'berkelanjutan_count', 'sosial_count',
                'emisi_count', 'karbon_count', 'energi_terbarukan_count',
                'pencapaian_count', 'indikator_kinerja_count',
                'green_density', 'sustainability_density', 'sustainability_linked_density']

X = df[feature_cols].fillna(0)

# Scale features so that large-valued columns (e.g. total_words) do not dominate distances
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# The silhouette score is only defined for 2 <= k <= n_samples - 1, so the
# search range (2..7) is capped by the number of documents available.
max_k = min(7, len(X_scaled) - 1)
if max_k < 2:
    raise ValueError(
        f"Need at least 3 documents to evaluate clusters, got {len(X_scaled)}"
    )

# Try different number of clusters
print("üîç Finding optimal number of clusters...")
silhouette_scores = []
for k in range(2, max_k + 1):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    silhouette_scores.append((k, score))
    print(f"  K={k}: Silhouette Score = {score:.4f}")

# Best K = the candidate with the highest silhouette score
best_k = max(silhouette_scores, key=lambda x: x[1])[0]
print(f"\n‚úÖ Optimal clusters: {best_k}")

# Final clustering: refit with the chosen K and store the assignment on df
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X_scaled)

print("\nCluster distribution:")
print(df['cluster'].value_counts().sort_index())

In [None]:
# =====================================================
# COMPARE: Cluster vs Rule-Based Labels
# =====================================================
# Check whether clustering recovers the same grouping as the rule-based
# approach, or a different one.

section_rule = "=" * 60

print("üîÑ Comparing Clustering Results vs Rule-Based Labels:")
print(section_rule)

# Cross-tabulation: rows are cluster ids, columns are rule-based labels
crosstab = pd.crosstab(df['cluster'], df['initial_label'])
print("\nCross-tabulation (Cluster vs Initial Label):")
print(crosstab)

# Cluster characteristics
print("\n" + section_rule)
print("üìä Cluster Characteristics (Mean values):")
print(section_rule)

# groupby yields the clusters in ascending id order, one sub-frame per cluster
for cluster_id, cluster_data in df.groupby('cluster'):
    cluster_means = cluster_data[['green_bond_count', 'sustainability_bond_count',
                                  'sustainability_linked_count', 'total_pages']].mean()
    print(f"\nüîπ Cluster {cluster_id} ({len(cluster_data)} documents):")
    print(f"   - Avg Green Bond Keywords: {cluster_means['green_bond_count']:.2f}")
    print(f"   - Avg Sustainability Keywords: {cluster_means['sustainability_bond_count']:.2f}")
    print(f"   - Avg Sustainability-Linked Keywords: {cluster_means['sustainability_linked_count']:.2f}")
    print(f"   - Avg Total Pages: {cluster_means['total_pages']:.1f}")
    print(f"   - Most common label: {cluster_data['initial_label'].mode().values[0]}")

# Documents in each cluster (first five file names only)
print("\n" + section_rule)
print("üìã Sample documents per cluster:")
for cluster_id, cluster_data in df.groupby('cluster'):
    print(f"\nCluster {cluster_id}:")
    sample_docs = list(cluster_data['filename'].head(5))
    for doc in sample_docs:
        print(f"  - {doc}")

In [None]:
# =====================================================
# APPROACH 2: SUPERVISED ML dengan FITUR TAMBAHAN
# =====================================================
# If ground-truth labels (from manual validation) exist, supervised ML can be used.
# For this demo initial_label is the target, but with DIFFERENT FEATURES
# from the ones used to create that label.

# IMPORTANT: to avoid circular logic we can either:
# 1. use ONLY non-keyword features for prediction, or
# 2. use the keyword features as "weak supervision".

from sklearn.pipeline import make_pipeline

# Filter: only documents that received a thematic label (not obligasi_biasa)
df_classified = df[df['initial_label'] != 'obligasi_biasa'].copy()

# A stratified split needs at least two classes with at least two documents
# each; without that check train_test_split(stratify=...) raises.
label_counts = df_classified['initial_label'].value_counts()
has_enough_data = (
    len(df_classified) > 10
    and len(label_counts) >= 2
    and label_counts.min() >= 2
)

if has_enough_data:
    print(f"üìä Using {len(df_classified)} classified documents for supervised learning")

    # Feature set 1: ONLY non-keyword features (to avoid circular logic).
    # NOTE(review): 'lingkungan_count' and 'indikator_kinerja_count' are
    # substring counts that also match inside labelling phrases such as
    # "berwawasan lingkungan" and "indikator kinerja utama keberlanjutan",
    # so they are not fully independent of initial_label - confirm whether
    # they should stay in this set.
    non_keyword_features = ['total_pages', 'total_words', 'avg_words_per_page',
                           'lingkungan_count', 'berkelanjutan_count', 'sosial_count',
                           'emisi_count', 'karbon_count', 'energi_terbarukan_count',
                           'pencapaian_count', 'indikator_kinerja_count']

    X_non_kw = df_classified[non_keyword_features].fillna(0)
    y = df_classified['initial_label']

    # Encode labels
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Split BEFORE scaling so the test rows never influence the scaler statistics
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X_non_kw, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
    )

    # Scale features: fit on the training rows only, then apply to both parts.
    # A dedicated name keeps the clustering cell's `scaler` / `X_scaled` intact.
    supervised_scaler = StandardScaler()
    X_train = supervised_scaler.fit_transform(X_train_raw)
    X_test = supervised_scaler.transform(X_test_raw)

    print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
    print(f"Classes: {le.classes_}")

    # Train multiple models with cross-validation
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
        'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
        'SVM': SVC(kernel='rbf', random_state=42)
    }

    print("\n" + "="*60)
    print("ü§ñ Model Performance with Cross-Validation:")
    print("="*60)

    for name, model in models.items():
        # Cross-validation on the unscaled training rows: the pipeline refits
        # the scaler inside every fold, so validation folds stay unseen.
        # cross_val_score works on clones, leaving `model` itself unfitted here.
        cv_scores = cross_val_score(
            make_pipeline(StandardScaler(), model),
            X_train_raw, y_train, cv=3, scoring='f1_weighted'
        )

        # Train on the full training part and evaluate on the held-out test part
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        test_acc = accuracy_score(y_test, y_pred)
        test_f1 = f1_score(y_test, y_pred, average='weighted')

        print(f"\nüìå {name}:")
        print(f"   CV F1 (train): {cv_scores.mean():.4f} ¬± {cv_scores.std():.4f}")
        print(f"   Test Accuracy: {test_acc:.4f}")
        print(f"   Test F1 (weighted): {test_f1:.4f}")
else:
    print("‚ö†Ô∏è Not enough classified documents for supervised learning")

In [None]:
# =====================================================
# APPROACH 3: Anomaly Detection - Cari Dokumen yang Mungkin Salah Klasifikasi
# =====================================================
# ML to find documents that are "different" from their group.
# This is useful for manual review.
# Note: the forest below is fitted once on all documents together.

from sklearn.ensemble import IsolationForest

print("üîç Anomaly Detection - Finding potentially misclassified documents")
print("=" * 60)

# Use all features (same columns as the clustering step), standardised with a fresh scaler
X_full = df[feature_cols].fillna(0)
full_scaler = StandardScaler()
X_scaled_full = full_scaler.fit_transform(X_full)

# Detect anomalies; rows marked -1 are the ones treated as anomalous below
iso_forest = IsolationForest(contamination=0.1, random_state=42)
df['is_anomaly'] = iso_forest.fit_predict(X_scaled_full)

is_outlier = df['is_anomaly'] == -1
anomalies = df[is_outlier]
print(f"\n‚ö†Ô∏è Found {len(anomalies)} potentially anomalous documents:")
print("-" * 60)

# One short report per flagged document
for _, row in anomalies.iterrows():
    print(f"\nüìÑ {row['filename']}")
    print(f"   Label: {row['initial_label']}")
    print(f"   Green: {row['green_bond_count']}, Sustain: {row['sustainability_bond_count']}, Linked: {row['sustainability_linked_count']}")
    print(f"   Pages: {row['total_pages']}, Words: {row['total_words']}")

print("\n" + "=" * 60)
print("üí° Dokumen anomali ini sebaiknya di-review manual untuk validasi")

In [None]:
# =====================================================
# SAVE RESULTS
# =====================================================

# Columns exported to the summary sheet: identity, the three labelling
# outputs, then the keyword counts and document size
summary_columns = [
    'filename', 'initial_label', 'cluster', 'is_anomaly',
    'green_bond_count', 'sustainability_bond_count', 'sustainability_linked_count',
    'total_pages', 'total_words',
]
result_df = df[summary_columns].copy()

# Replace the numeric anomaly flag (1 / -1) with a readable name
anomaly_names = {1: 'Normal', -1: 'Anomaly'}
result_df['is_anomaly'] = result_df['is_anomaly'].map(anomaly_names)

# Save to Excel
output_path = r"d:\1. Important\Work\Bank Indonesia\DSta-DSMF\Web Scraping\hasil_klasifikasi_ml.xlsx"
result_df.to_excel(output_path, index=False)
print(f"‚úÖ Results saved to: {output_path}")

# Display summary
separator = "=" * 60
print("\n" + separator)
print("üìä CLASSIFICATION SUMMARY")
print(separator)
print(f"\nTotal documents: {len(df)}")

print(f"\nBy Initial Label:")
label_totals = df['initial_label'].value_counts()
for label, count in label_totals.items():
    print(f"  - {label}: {count}")

print(f"\nBy Cluster:")
cluster_totals = df['cluster'].value_counts().sort_index()
for cluster, count in cluster_totals.items():
    print(f"  - Cluster {cluster}: {count}")

print(f"\nAnomalies detected: {len(anomalies)}")

result_df.head(20)