In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [3]:
# ---------------------------
# Toy starting data: two perfectly correlated features and a binary label
# ---------------------------
df = pd.DataFrame(
    [
        (2, 20, 1),
        (5, 50, 0),
        (2, 20, 1),
        (2, 20, 1),
        (2, 20, 1),
    ],
    columns=['Feature1', 'Feature2', 'target'],
)

In [4]:
# Start from a uniform distribution: each of the N rows carries weight 1/N
df['weights'] = 1 / df.shape[0]

In [5]:
# ---------------------------
# Function to compute cumulative sums for weighted sampling
# ---------------------------
def compute_cumsum(df):
    """Attach the sampling intervals derived from the ``weights`` column.

    Three columns are written to ``df`` in place:

    * ``normalized_weights`` -- ``weights`` divided by their sum.
    * ``cumsum_upper`` -- running total of ``normalized_weights``; the
      exclusive upper edge of each row's interval.
    * ``cumsum_lower`` -- the previous row's ``cumsum_upper`` (0 for the
      first row); the inclusive lower edge of each row's interval.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``weights`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three columns added or overwritten.

    Raises
    ------
    ValueError
        If a non-empty frame has NaN/infinite or negative weights, or
        weights that do not sum to a positive number.
    """
    weights = df['weights']

    # An empty frame has nothing to validate and yields empty columns below.
    if len(weights) > 0:
        values = weights.to_numpy(dtype=float)
        # Without this check a zero total divides 0/0 into NaN boundaries and
        # negative weights give non-monotonic boundaries; both corrupt the
        # interval lookup that consumes these columns.
        if (not np.isfinite(values).all()
                or (values < 0).any()
                or values.sum() <= 0):
            raise ValueError(
                "weights must be finite, non-negative and sum to a positive value"
            )

    df['normalized_weights'] = weights / weights.sum()
    df['cumsum_upper'] = df['normalized_weights'].cumsum()
    df['cumsum_lower'] = df['cumsum_upper'].shift(1, fill_value=0)
    return df

In [6]:
# ---------------------------
# Weighted resampling function
# ---------------------------
def create_new_dataset(df, n_samples=None):
    """Draw row labels from ``df`` with probability proportional to weight.

    Each uniform draw ``a`` in [0, 1) selects the row whose interval
    ``cumsum_lower <= a < cumsum_upper`` contains it. Because
    ``cumsum_lower`` is the previous row's ``cumsum_upper``, the lookup only
    needs the upper boundaries and is done with a binary search.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a non-decreasing ``cumsum_upper`` column, as produced by
        ``compute_cumsum``.
    n_samples : int, optional
        Number of draws. Defaults to ``len(df)``.

    Returns
    -------
    list
        Index labels of the selected rows, one per draw (with repetition).
        Empty when ``df`` is empty or ``n_samples`` is not positive.

    Raises
    ------
    ValueError
        If ``cumsum_upper`` contains NaN, decreases, or ends at a
        non-positive value.
    """
    if n_samples is None:
        n_samples = len(df)
    if len(df) == 0 or n_samples <= 0:
        return []

    upper = df['cumsum_upper'].to_numpy(dtype=float)
    if np.isnan(upper).any() or (np.diff(upper) < 0).any() or upper[-1] <= 0:
        raise ValueError(
            "cumsum_upper must be non-decreasing, free of NaN and end above 0"
        )

    # One vectorised call consumes the global NumPy RNG stream in the same
    # order as n_samples separate np.random.random() calls.
    draws = np.random.random(n_samples)

    # First position whose upper boundary is strictly greater than the draw;
    # zero-weight rows (upper == previous upper) are skipped automatically.
    positions = np.searchsorted(upper, draws, side='right')

    # Floating-point rounding can leave the final boundary slightly below 1.0.
    # A draw landing in that gap used to match no row and was silently lost;
    # assign it to the last row that actually carries weight instead.
    last_weighted = np.searchsorted(upper, upper[-1], side='left')
    positions = np.minimum(positions, last_weighted)

    return df.index[positions].tolist()

In [7]:
# ---------------------------
# AdaBoost-style resampling loop
# ---------------------------
max_iterations = 10
alpha = 0.5   # fixed step for the weight increase (not the error-based AdaBoost alpha)
SEED = 42     # makes both the resampling draws and the stump's tie-breaking repeatable
iteration = 0
datasets = []

np.random.seed(SEED)

for iteration in range(1, max_iterations + 1):
    print(f"\nIteration {iteration}:")

    # Step 1: Turn the current weights into sampling intervals
    df = compute_cumsum(df)

    # Step 2: Sample new dataset according to weights
    index_values = create_new_dataset(df)
    new_df = df.loc[index_values, ['Feature1', 'Feature2', 'target', 'weights']].reset_index(drop=True)
    datasets.append(new_df)

    # Step 3: Train a weak learner (decision stump) on the resampled rows.
    # The weights already decided which rows were drawn, so passing them again
    # as sample_weight would count them twice.
    clf = DecisionTreeClassifier(max_depth=1, random_state=SEED)
    clf.fit(new_df[['Feature1', 'Feature2']], new_df['target'])

    # Step 4: Record predictions on the resampled rows (kept for inspection)
    new_df['y_pred'] = clf.predict(new_df[['Feature1', 'Feature2']])

    # Step 5: Identify misclassified points among the ORIGINAL rows.
    # new_df has a reset index over resampled rows, so its predictions do not
    # line up with df row-by-row; predict on df itself to get one prediction
    # per original row.
    is_wrong = df['target'] != clf.predict(df[['Feature1', 'Feature2']])
    misclassified = df[is_wrong]
    print("Number of misclassified points:", len(misclassified))

    # Stop if no misclassifications
    if len(misclassified) == 0:
        print("No misclassifications — stopping resampling.")
        break

    # Step 6: Update weights for next iteration.
    # Misclassified rows are scaled by exp(alpha); correct rows are unchanged.
    # compute_cumsum re-normalises the weights at the start of the next pass.
    df['weights'] = df['weights'] * np.exp(alpha * is_wrong.astype(float))


Iteration 1:
Number of misclassified points: 0
No misclassifications — stopping resampling.


In [8]:
# ---------------------------
# Final output
# ---------------------------
# Show every resampled dataset, numbered from 1 in the order it was drawn
print("\nFinal datasets:")
for number, sampled in enumerate(datasets, start=1):
    print(f"\nDataset {number}:")
    print(sampled)


Final datasets:

Dataset 1:
   Feature1  Feature2  target  weights  y_pred
0         2        20       1      0.2       1
1         2        20       1      0.2       1
2         2        20       1      0.2       1
3         2        20       1      0.2       1
4         5        50       0      0.2       0
