# Overview Materi

Jelaskan secara singkat perbedaan antara bagging dan boosting menurut pemahamanmu!

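Bagging: metode ensemble dalam machine learning untuk mengurangi varians dan overfitting. Dari dataset dibuat banyak subset (umumnya dengan bootstrap sampling, yaitu pengambilan sampel acak dengan pengembalian), lalu model machine learning dengan jenis yang sama dilatih pada setiap subset secara terpisah, dan hasil prediksinya digabungkan (voting mayoritas untuk klasifikasi, rata-rata untuk regresi).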

Boosting: metode ensemble dalam machine learning untuk mengurangi bias, namun berisiko menyebabkan overfitting jika tidak dilakukan dengan tepat. Model dilatih secara berurutan: model pertama menghasilkan prediksi dari dataset, lalu model berikutnya belajar dari kesalahan model sebelumnya. Proses ini diulangi terus-menerus sesuai jumlah iterasi yang diinginkan.

# Import Data & Libraries

In [23]:
# import semua libraries yang akan dibutuhkan
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, explained_variance_score
from collections import Counter

In [10]:
# Load the breast-cancer dataset from a CSV file in the working directory.
data = pd.read_csv('breast-cancer.csv')

# Separate the features from the label; the label column is 'diagnosis'.
# NOTE(review): any identifier column in the CSV (e.g. an 'id' column) would
# still be part of X here -- confirm against the file's header.
X = data.drop('diagnosis', axis=1)  # every column except 'diagnosis'
y = data['diagnosis']               # the label column only

# 80:20 train/test split. A fixed random_state makes the split, and therefore
# every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [11]:
# Build a DataFrame with a subset of the feature columns so it can be
# previewed with .head().
# NOTE: despite its name, `target` holds feature-column names, not the label.
# NOTE(review): a name that does not exist in X presumably comes back as an
# all-NaN column -- confirm spellings such as 'concave_points_mean' against
# the CSV header.
target = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave_points_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
    'radius_se',
    'texture_se',
    'perimeter_se',  # was listed twice, which duplicated the column
    'area_se',
    'smoothness_se',
]
df = pd.DataFrame(X, columns=target)

print(df.head())

NameError: name 'feature_names' is not defined

# Bagging from Scratch

## Random Forest Classifier - Sklearn

In [12]:
# Train scikit-learn's RandomForestClassifier with its default settings.
clf = RandomForestClassifier()
clf.fit(X=X_train, y=y_train)
preds = clf.predict(X=X_test)

# Report the accuracy on the held-out test split.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=preds))

Accuracy: 0.9473684210526315


## Random Forest from Scratch
Source: https://www.youtube.com/watch?v=kFwe2ZZU7yw

In [None]:
class RandomForest:
    """Bagging ensemble of decision trees with majority-vote prediction.

    Each tree is a scikit-learn ``DecisionTreeClassifier`` trained on a
    bootstrap sample (drawn with replacement) of the training data.

    Parameters:
        n_trees: number of trees to train.
        max_depth: maximum depth passed to every tree.
        min_samples_split: minimum samples needed to split a node, passed to
            every tree.
        n_feature: forwarded to each tree as ``max_features``; ``None`` lets
            every split consider all features.
        random_state: seed for the bootstrap draws and the trees. ``None``
            uses NumPy's global random state and leaves the trees unseeded.
    """

    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2, n_feature=None, random_state=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_features = n_feature
        self.random_state = random_state
        # A private, seeded generator when a seed is given; otherwise the
        # global numpy.random module (which exposes the same .choice API).
        if random_state is None:
            self._rng = np.random
        else:
            self._rng = np.random.RandomState(random_state)
        self.trees = []

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap sample of (X, y)."""
        # Plain arrays make positional row indexing safe even when X / y are
        # pandas objects whose index is no longer 0..n-1.
        X = np.asarray(X)
        y = np.asarray(y)

        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                max_features=self.n_features,
                random_state=self._tree_seed(),
            )

            X_sample, y_sample = self._bootstrap_samples(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def _tree_seed(self):
        """Return a per-tree seed, or None when the forest itself is unseeded."""
        if self.random_state is None:
            return None
        return int(self._rng.randint(0, 2**31 - 1))

    def _bootstrap_samples(self, X, y):
        """Return (X, y) rows resampled with replacement, same size as the input."""
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        idxs = self._rng.choice(n_samples, n_samples, replace=True)
        return X[idxs], y[idxs]

    def _most_common_label(self, y):
        """Return the label that occurs most often in ``y``."""
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

    def predict(self, X):
        """Return the majority-vote label of all trees for every row of X.

        Raises:
            ValueError: if the forest has no trained trees yet.
        """
        if not self.trees:
            raise ValueError("RandomForest must be fitted before calling predict().")

        X = np.asarray(X)
        # Shape (n_trees, n_samples) -> (n_samples, n_trees), so that each row
        # holds every tree's vote for one sample.
        per_tree = np.array([tree.predict(X) for tree in self.trees])
        per_sample = np.swapaxes(per_tree, 0, 1)
        predictions = np.array([self._most_common_label(votes) for votes in per_sample])

        return predictions

## Predict Using Random Forest

In [None]:
def accuracy(y_true, y_pred):
    """Return the fraction of entries of ``y_pred`` that equal ``y_true``.

    Both arguments may be any array-like (list, NumPy array, pandas Series);
    they are compared position by position.

    Returns:
        A float in [0, 1], or NaN when the inputs are empty.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    # Convert first: comparing two plain lists with == yields a single bool
    # instead of an element-wise result.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {true_arr.shape} and {pred_arr.shape}"
        )
    if true_arr.size == 0:
        return float("nan")
    return float(np.count_nonzero(true_arr == pred_arr) / len(true_arr))

# Fit the from-scratch RandomForest (default settings) on the training split.
clf = RandomForest()
clf.fit(X=X_train, y=y_train)
predictions = clf.predict(X=X_test)

# Report the share of correctly classified test samples.
print(accuracy(y_true=y_test, y_pred=predictions))

# Boosting From Scratch

## Gradient Boosting Classifier - Sklearn

In [None]:
# Define and train scikit-learn's GradientBoostingClassifier.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# The estimator must be bound to `clf`: it is `clf` that gets fitted and
# queried below, so binding it to `preds` would score whatever model `clf`
# pointed to before this cell.
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# Report the accuracy on the held-out test split.
print(accuracy_score(y_test, preds))

## Gradient Boosting from Scratch with Decision Tree
Source: https://www.youtube.com/watch?v=Pq2mmJxjs1o

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise.

    Accepts a scalar or anything NumPy can negate and exponentiate
    element-wise.
    """
    # For large negative x, exp(-x) overflows to inf. The quotient 1 / inf is
    # still the correct limit 0.0, so only the overflow warning is silenced.
    with np.errstate(over="ignore"):
        return 1 / (1 + np.exp(-x))

class gradientBoostingClassifier:
    """Binary gradient-boosting classifier built from regression base learners.

    The running score ``Fm`` starts at zero. At every boosting round a new
    base learner is fitted to the residuals ``y - sigmoid(Fm)`` and its
    predictions, scaled by ``lr``, are added to ``Fm``.

    Parameters:
        lr: learning rate applied to every base learner's contribution.
        n_estimators: number of boosting rounds.
        base_learner: callable returning a fresh regressor that provides
            ``fit(X, y)`` and ``predict(X)``.
        plot: when True, ``fit`` draws one small panel per round showing the
            labels and the current score ``Fm``.
    """

    def __init__(self, lr=0.1, n_estimators=25, base_learner=DecisionTreeRegressor, plot=True):
        self.lr = lr
        self.n_estimators = n_estimators
        self.base_learner = base_learner
        self.plot = plot

    def fit(self, X, y, **params):
        """Fit ``n_estimators`` base learners sequentially.

        ``y`` must be convertible to a float array; the residual
        ``y - sigmoid(Fm)`` presumes labels encoded as 0 / 1.
        ``**params`` are forwarded to ``base_learner`` at every round.
        """
        # Plain arrays keep residuals and plots positional even when pandas
        # objects with a shuffled index are passed in.
        X = np.asarray(X)
        y = np.asarray(y, dtype=float)

        self.base_models = []

        Fm = np.zeros_like(y, dtype=float)

        show_progress = self.plot and self.n_estimators > 0
        axs = self._make_axes() if show_progress else None

        for i in range(self.n_estimators):
            r_i = y - sigmoid(Fm)
            h_i = self.base_learner(**params)
            h_i.fit(X, r_i)
            self.base_models.append(h_i)

            # update the model
            Fm = Fm + self.lr * h_i.predict(X)

            if axs is not None:
                axs[i].plot(y, '.')
                axs[i].plot(Fm, '.')
                axs[i].set_title(str(i))

        if axs is not None:
            plt.tight_layout()
            plt.show()

    def _make_axes(self, n_cols=5):
        """Create a flat array of axes with one panel per boosting round."""
        # Ceil division: the grid grows with n_estimators instead of being a
        # fixed 5x5, which overflowed for more than 25 rounds.
        n_rows = -(-self.n_estimators // n_cols)
        _, axs = plt.subplots(
            n_rows, n_cols, figsize=(2 * n_cols, 2 * n_rows), squeeze=False
        )
        axs = axs.flatten()
        for ax in axs:
            # Also hides the frames of panels that end up unused.
            ax.axis('off')
        return axs

    def predict(self, X):
        """Return 0 / 1 predictions for every row of X (threshold 0.5)."""
        X = np.asarray(X)
        Fm = np.zeros(X.shape[0])
        for h_i in self.base_models:
            Fm += self.lr * h_i.predict(X)

        probs = sigmoid(Fm)

        return (probs >= 0.5).astype(int)


## Predict Using Gradient Boosting

In [None]:
# Define and train the model.
# max_depth is an estimator hyperparameter, so it goes to the constructor;
# GradientBoostingRegressor.fit() only accepts the training data (plus the
# optional sample_weight / monitor) and rejects a max_depth keyword.
# NOTE(review): a regressor needs a numeric y_train -- confirm the labels are
# encoded as numbers before this cell runs.
model = GradientBoostingRegressor(max_depth=4)
r = model.fit(X_train, y_train)

In [None]:
# Predict on the held-out test split.
preds = model.predict(X=X_test)

# Report the explained-variance score of the predictions
# (this is a regression metric, not accuracy).
score = explained_variance_score(y_true=y_test, y_pred=preds)
print("score: ", score)

In [None]:
# Plot the predictions against the ground truth, one point per test sample.
_, ax = plt.subplots(1, 1)
plt.title('test')
# y_test is converted to a plain array so that both series are drawn at the
# same 0..n-1 x positions; a pandas Series would be plotted against its own
# (shuffled) index and would not line up with preds.
ax.plot(np.asarray(y_test), 'o', label = 'y_test')
ax.plot(preds, 'o', label = 'preds')
ax.legend()
plt.show()