In [18]:
# Import library yang diperlukan
import numpy as np
import pandas as pd
from sklearn.ensemble import (RandomForestClassifier, VotingClassifier,
                            BaggingClassifier, AdaBoostClassifier,
                            GradientBoostingRegressor, RandomForestRegressor)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.datasets import make_classification, make_regression
import xgboost as xgb

# ===== CONTOH DATA =====
# Membuat dataset contoh untuk klasifikasi
X_class, y_class = make_classification(n_samples=1000, n_features=20,
                                     n_informative=15, n_redundant=5,
                                     random_state=42)
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42)

# Membuat dataset contoh untuk regresi
X_reg, y_reg = make_regression(n_samples=1000, n_features=20,
                              noise=0.1, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42)

print("Dataset berhasil dibuat!")
print(f"Klasifikasi - Train: {X_train_class.shape}, Test: {X_test_class.shape}")
print(f"Regresi - Train: {X_train_reg.shape}, Test: {X_test_reg.shape}")
print("="*60)

# ===== 1. VOTING CLASSIFIER =====
print("1. VOTING CLASSIFIER")
print("-" * 30)

# Inisialisasi model individual
log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(probability=True, random_state=42)  # probability=True untuk soft voting

# Hard Voting
voting_clf_hard = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard'
)
voting_clf_hard.fit(X_train_class, y_train_class)
y_pred_hard = voting_clf_hard.predict(X_test_class)
accuracy_hard = accuracy_score(y_test_class, y_pred_hard)
print(f"Hard Voting Accuracy: {accuracy_hard:.4f}")

# Soft Voting
voting_clf_soft = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft'
)
voting_clf_soft.fit(X_train_class, y_train_class)
y_pred_soft = voting_clf_soft.predict(X_test_class)
accuracy_soft = accuracy_score(y_test_class, y_pred_soft)
print(f"Soft Voting Accuracy: {accuracy_soft:.4f}")

# Bandingkan dengan model individual
for clf_name, clf in voting_clf_soft.named_estimators_.items():
    clf.fit(X_train_class, y_train_class)
    y_pred_individual = clf.predict(X_test_class)
    accuracy_individual = accuracy_score(y_test_class, y_pred_individual)
    print(f"{clf_name} Individual Accuracy: {accuracy_individual:.4f}")

print("="*60)

# ===== 2. BAGGING CLASSIFIER =====
print("2. BAGGING CLASSIFIER")
print("-" * 30)

# Bagging dengan Decision Tree
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42
)
bag_clf.fit(X_train_class, y_train_class)
y_pred_bag = bag_clf.predict(X_test_class)
accuracy_bag = accuracy_score(y_test_class, y_pred_bag)
print(f"Bagging Classifier Accuracy: {accuracy_bag:.4f}")

# Bandingkan dengan single Decision Tree
single_tree = DecisionTreeClassifier(random_state=42)
single_tree.fit(X_train_class, y_train_class)
y_pred_single = single_tree.predict(X_test_class)
accuracy_single = accuracy_score(y_test_class, y_pred_single)
print(f"Single Decision Tree Accuracy: {accuracy_single:.4f}")
print(f"Improvement: {accuracy_bag - accuracy_single:.4f}")

print("="*60)

# ===== 3. RANDOM FOREST =====
print("3. RANDOM FOREST")
print("-" * 30)

# Random Forest Classifier
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42
)
rnd_clf.fit(X_train_class, y_train_class)
y_pred_rf = rnd_clf.predict(X_test_class)
accuracy_rf = accuracy_score(y_test_class, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")

# Feature importance
feature_importance = rnd_clf.feature_importances_
print("Top 5 Most Important Features:")
for i, importance in enumerate(sorted(enumerate(feature_importance),
                                    key=lambda x: x[1], reverse=True)[:5]):
    print(f"Feature {importance[0]}: {importance[1]:.4f}")

# Extra Trees (Extremely Randomized Trees)
bag_clf_extra = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16, random_state=42),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=True,
    n_jobs=-1,
    random_state=42
)
bag_clf_extra.fit(X_train_class, y_train_class)
y_pred_extra = bag_clf_extra.predict(X_test_class)
accuracy_extra = accuracy_score(y_test_class, y_pred_extra)
print(f"Extra Trees Accuracy: {accuracy_extra:.4f}")

print("="*60)

# ===== 4. ADABOOST =====
print("4. ADABOOST")
print("-" * 30)

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1, random_state=42),
    n_estimators=200,
    algorithm="SAMME",
    learning_rate=0.5,
    random_state=42
)
ada_clf.fit(X_train_class, y_train_class)
y_pred_ada = ada_clf.predict(X_test_class)
accuracy_ada = accuracy_score(y_test_class, y_pred_ada)
print(f"AdaBoost Accuracy: {accuracy_ada:.4f}")

print("="*60)

# ===== 5. GRADIENT BOOSTING (Manual Implementation) =====
print("5. GRADIENT BOOSTING - Manual Implementation")
print("-" * 30)

# Manual Gradient Boosting untuk Regresi
def manual_gradient_boosting(X, y, X_new, max_depth=2, n_estimators=3):
    """
    Implementasi manual gradient boosting
    """
    trees = []
    predictions = []

    # First tree
    tree_reg1 = DecisionTreeRegressor(max_depth=max_depth, random_state=42)
    tree_reg1.fit(X, y)
    trees.append(tree_reg1)

    # Residual untuk tree kedua
    y_residual = y - tree_reg1.predict(X)
    tree_reg2 = DecisionTreeRegressor(max_depth=max_depth, random_state=42)
    tree_reg2.fit(X, y_residual)
    trees.append(tree_reg2)

    # Residual untuk tree ketiga
    y_residual2 = y_residual - tree_reg2.predict(X)
    tree_reg3 = DecisionTreeRegressor(max_depth=max_depth, random_state=42)
    tree_reg3.fit(X, y_residual2)
    trees.append(tree_reg3)

    # Prediksi final
    y_pred = sum(tree.predict(X_new) for tree in trees)
    return y_pred, trees

# Test manual implementation
y_pred_manual, manual_trees = manual_gradient_boosting(
    X_train_reg, y_train_reg, X_test_reg)
mse_manual = mean_squared_error(y_test_reg, y_pred_manual)
print(f"Manual Gradient Boosting MSE: {mse_manual:.4f}")

print("="*60)

# ===== 6. GRADIENT BOOSTING (Sklearn) =====
print("6. GRADIENT BOOSTING - Sklearn Implementation")
print("-" * 30)

# Basic Gradient Boosting
gbrt = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=100,
    learning_rate=0.1,
    random_state=42
)
gbrt.fit(X_train_reg, y_train_reg)
y_pred_gbrt = gbrt.predict(X_test_reg)
mse_gbrt = mean_squared_error(y_test_reg, y_pred_gbrt)
print(f"Gradient Boosting MSE: {mse_gbrt:.4f}")

# Gradient Boosting dengan Early Stopping
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_reg, y_train_reg, test_size=0.2, random_state=42)

gbrt_early = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=200,
    learning_rate=0.1,
    random_state=42
)
gbrt_early.fit(X_train_split, y_train_split)

# Mencari estimator terbaik
errors = [mean_squared_error(y_val_split, y_pred)
          for y_pred in gbrt_early.staged_predict(X_val_split)]
best_n_estimators = np.argmin(errors) + 1

print(f"Best number of estimators: {best_n_estimators}")

# Training ulang dengan estimator terbaik
gbrt_best = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=best_n_estimators,
    learning_rate=0.1,
    random_state=42
)
gbrt_best.fit(X_train_reg, y_train_reg)
y_pred_best = gbrt_best.predict(X_test_reg)
mse_best = mean_squared_error(y_test_reg, y_pred_best)
print(f"Optimized Gradient Boosting MSE: {mse_best:.4f}")

# Early Stopping dengan warm_start
gbrt_warm = GradientBoostingRegressor(
    max_depth=2,
    warm_start=True,
    learning_rate=0.1,
    random_state=42
)

min_val_error = float("inf")
error_going_up = 0
best_estimators_warm = 0

for n_estimators in range(1, 200):
    gbrt_warm.n_estimators = n_estimators
    gbrt_warm.fit(X_train_split, y_train_split)
    y_pred_val = gbrt_warm.predict(X_val_split)
    val_error = mean_squared_error(y_val_split, y_pred_val)

    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
        best_estimators_warm = n_estimators
    else:
        error_going_up += 1
        if error_going_up == 5:  # early stopping
            break

print(f"Early stopping at {best_estimators_warm} estimators")
print(f"Best validation error: {min_val_error:.4f}")

print("="*60)

# ===== 7. XGBOOST =====
print("7. XGBOOST")
print("-" * 30)

# Basic XGBoost
xgb_reg = xgb.XGBRegressor(random_state=42)
xgb_reg.fit(X_train_reg, y_train_reg)
y_pred_xgb = xgb_reg.predict(X_test_reg)
mse_xgb = mean_squared_error(y_test_reg, y_pred_xgb)
print(f"XGBoost MSE: {mse_xgb:.4f}")

# XGBoost dengan Early Stopping (versi baru)
try:
    # Untuk XGBoost versi baru (>= 1.6.0)
    xgb_reg_early = xgb.XGBRegressor(
        n_estimators=200,
        learning_rate=0.1,
        early_stopping_rounds=10,
        eval_metric='rmse',
        random_state=42
    )
    xgb_reg_early.fit(
        X_train_split, y_train_split,
        eval_set=[(X_val_split, y_val_split)],
        verbose=False
    )
    y_pred_xgb_early = xgb_reg_early.predict(X_test_reg)
    mse_xgb_early = mean_squared_error(y_test_reg, y_pred_xgb_early)
    print(f"XGBoost with Early Stopping MSE: {mse_xgb_early:.4f}")
    try:
        print(f"Best iteration: {xgb_reg_early.best_iteration}")
    except:
        print("Best iteration info not available")

except Exception as e:
    # Fallback untuk XGBoost versi lama
    print(f"XGBoost early stopping error: {e}")
    print("Using basic XGBoost without early stopping...")
    xgb_reg_early = xgb.XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42
    )
    xgb_reg_early.fit(X_train_split, y_train_split)
    y_pred_xgb_early = xgb_reg_early.predict(X_test_reg)
    mse_xgb_early = mean_squared_error(y_test_reg, y_pred_xgb_early)
    print(f"XGBoost (fallback) MSE: {mse_xgb_early:.4f}")

print("="*60)

# ===== SUMMARY PERBANDINGAN =====
print("SUMMARY - MODEL COMPARISON")
print("-" * 40)
print("CLASSIFICATION MODELS:")
print(f"Hard Voting:        {accuracy_hard:.4f}")
print(f"Soft Voting:        {accuracy_soft:.4f}")
print(f"Bagging:            {accuracy_bag:.4f}")
print(f"Random Forest:      {accuracy_rf:.4f}")
print(f"Extra Trees:        {accuracy_extra:.4f}")
print(f"AdaBoost:           {accuracy_ada:.4f}")
print(f"Single Tree:        {accuracy_single:.4f}")

print("\nREGRESSION MODELS (MSE):")
print(f"Manual Gradient Boosting:    {mse_manual:.4f}")
print(f"Gradient Boosting:           {mse_gbrt:.4f}")
print(f"Optimized Gradient Boosting: {mse_best:.4f}")
print(f"XGBoost:                     {mse_xgb:.4f}")
print(f"XGBoost Early Stopping:      {mse_xgb_early:.4f}")

print("\n" + "="*60)
print("KODE SELESAI - SEMUA MODEL BERHASIL DILATIH!")

Dataset berhasil dibuat!
Klasifikasi - Train: (800, 20), Test: (200, 20)
Regresi - Train: (800, 20), Test: (200, 20)
1. VOTING CLASSIFIER
------------------------------
Hard Voting Accuracy: 0.9100
Soft Voting Accuracy: 0.9100
lr Individual Accuracy: 0.8250
rf Individual Accuracy: 0.9000
svc Individual Accuracy: 0.9350
2. BAGGING CLASSIFIER
------------------------------
Bagging Classifier Accuracy: 0.8750
Single Decision Tree Accuracy: 0.7900
Improvement: 0.0850
3. RANDOM FOREST
------------------------------
Random Forest Accuracy: 0.8700
Top 5 Most Important Features:
Feature 12: 0.1731
Feature 2: 0.0930
Feature 5: 0.0727
Feature 17: 0.0694
Feature 6: 0.0686
Extra Trees Accuracy: 0.8700
4. ADABOOST
------------------------------




AdaBoost Accuracy: 0.8100
5. GRADIENT BOOSTING - Manual Implementation
------------------------------
Manual Gradient Boosting MSE: 20303.1623
6. GRADIENT BOOSTING - Sklearn Implementation
------------------------------
Gradient Boosting MSE: 3964.4760
Best number of estimators: 200
Optimized Gradient Boosting MSE: 2044.9120
Early stopping at 199 estimators
Best validation error: 2931.7191
7. XGBOOST
------------------------------
XGBoost MSE: 5530.4742
XGBoost with Early Stopping MSE: 5498.4452
Best iteration: 198
SUMMARY - MODEL COMPARISON
----------------------------------------
CLASSIFICATION MODELS:
Hard Voting:        0.9100
Soft Voting:        0.9100
Bagging:            0.8750
Random Forest:      0.8700
Extra Trees:        0.8700
AdaBoost:           0.8100
Single Tree:        0.7900

REGRESSION MODELS (MSE):
Manual Gradient Boosting:    20303.1623
Gradient Boosting:           3964.4760
Optimized Gradient Boosting: 2044.9120
XGBoost:                     5530.4742
XGBoost Early St