In [1]:
# Step 1: Import Necessary Libraries

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Step 2: Load and Prepare the Dataset
iris = load_iris()
X = iris.data
y = iris.target

print(f"Feature Names: {iris.feature_names})")
print("Target Names", iris.target_names)

Feature Names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'])
Target Names ['setosa' 'versicolor' 'virginica']


In [5]:
# Step 3: Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape

(120, 4)

In [7]:
X_test.shape

(30, 4)

In [8]:
# Step 4: Initialize and Train the model

n_estimators = 100 # means we will build 100 decision trees
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

rf_classifier.fit(X_train, y_train)

In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred=  rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [10]:
print('Accuracy score: ', accuracy)
print(classification_report(y_test, y_pred))

Accuracy score:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [12]:
%pip install -U imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [13]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score

X, y = make_classification(n_samples=1000, n_features=20, n_informative=5, n_redundant=0, n_classes=2, n_clusters_per_class=1, weights=[0.95, 0.05], flip_y=0, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print(f"Class distribution in training set: {np.bincount(y_train)}")
print(f"Class distribution in test set: {np.bincount(y_test)}")

standard_rf = RandomForestClassifier(n_estimators=100, random_state=42)
standard_rf.fit(X_train, y_train)
y_pred_standard = standard_rf.predict(X_test)

print("--- Standard Random Forest Performance ---")
print(classification_report(y_test, y_pred_standard))

Class distribution in training set: [665  35]
Class distribution in test set: [285  15]
--- Standard Random Forest Performance ---
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       285
           1       1.00      0.60      0.75        15

    accuracy                           0.98       300
   macro avg       0.99      0.80      0.87       300
weighted avg       0.98      0.98      0.98       300



In [19]:
# 2. Random Forest with OOB Score and Class Weighting
dict_weights = {0:4, 1: 5}
weighted_rf = RandomForestClassifier(n_estimators=100, class_weight=dict_weights, oob_score=True, random_state=42)

weighted_rf.fit(X_train, y_train)

# Access the OOB Score
print(f"--- OOB Score with Weighted RF ---")
print(f"Out-of-Bag Score: {weighted_rf.oob_score_:.4f}")

# Evaluate on the test set
y_pred_weighted = weighted_rf.predict(X_test)

print("\n--- Weighted RF Performance ---")
print(classification_report(y_test, y_pred_weighted))

--- OOB Score with Weighted RF ---
Out-of-Bag Score: 0.9771

--- Weighted RF Performance ---
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       285
           1       1.00      0.60      0.75        15

    accuracy                           0.98       300
   macro avg       0.99      0.80      0.87       300
weighted avg       0.98      0.98      0.98       300



In [20]:
from imblearn.ensemble import BalancedRandomForestClassifier
# 3. Balanced RF Classifier
balanced_rf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)

balanced_rf.fit(X_train, y_train)
y_pred_balanced = balanced_rf.predict(X_test)

print("\n--- BalancedRandomForestClassifier Performance ---")
print(classification_report(y_test, y_pred_balanced))


--- BalancedRandomForestClassifier Performance ---
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       285
           1       0.76      0.87      0.81        15

    accuracy                           0.98       300
   macro avg       0.88      0.93      0.90       300
weighted avg       0.98      0.98      0.98       300



In [22]:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

X, y = make_moons(n_samples=1000, noise=0.2, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [24]:
X[:5, :]

array([[-0.11166654,  0.52022374],
       [ 1.14264982, -0.34257734],
       [ 0.79555796, -0.01144231],
       [ 0.11182668, -0.55193153],
       [-0.81646618,  0.54399604]])

In [25]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard'
)
voting_clf.fit(X_train, y_train)

In [26]:
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8933333333333333
RandomForestClassifier 0.9833333333333333
SVC 0.99
VotingClassifier 0.99


**Hard Voting:** select one class by majority vote

**Soft Voting:** Averages out probability from the probability distribution of each class and vote for the highest

### Bagging and Pasting

When sampling is performed with replacement, this method is called **bagging**.

When sampling is performed without replacement, it is called
**pasting**.

In [32]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1
)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [33]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       150
           1       0.98      0.97      0.98       150

    accuracy                           0.98       300
   macro avg       0.98      0.98      0.98       300
weighted avg       0.98      0.98      0.98       300



In [34]:
y_pred_dt = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

In [35]:
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96       150
           1       0.98      0.95      0.96       150

    accuracy                           0.96       300
   macro avg       0.96      0.96      0.96       300
weighted avg       0.96      0.96      0.96       300



In [36]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500, bootstrap=True, n_jobs=-1, oob_score=True
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.9657142857142857

In [37]:
bag_clf.oob_decision_function_

array([[0.        , 1.        ],
       [0.77202073, 0.22797927],
       [0.        , 1.        ],
       ...,
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ]])

### Random Patches and Random Subspaces

Random Patches Method: Sampling both training instances and features
Random Subspaces: Sampling only features

### Random Forest Classifiers

In [38]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

### Feature Importance

In [39]:
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris['data'], iris['target'])
for name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(name, score)



sepal length (cm) 0.09796623023882102
sepal width (cm) 0.02446205134911128
petal length (cm) 0.4389485209056356
petal width (cm) 0.4386231975064321


## Boosting

In [41]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200, algorithm="SAMME", learning_rate=0.5
)
ada_clf.fit(X_train, y_train)

