In [17]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [2]:
# Step 1: Build a synthetic stand-in for a customer churn dataset.
# In a real setting the feature columns could be things like purchase frequency,
# average order value, customer support interactions, etc.
X, y = make_classification(
    n_samples=1000,      # number of simulated customers
    n_features=10,       # total feature columns
    n_informative=5,
    n_redundant=2,
    weights=[0.8, 0.2],  # class proportions: class 0 is the majority, class 1 the minority
    flip_y=0,            # do not randomly reassign any labels
    random_state=42,     # fixed seed so the dataset is reproducible
)

In [3]:
# Seed the labeled set: 10% of the customers are treated as already labeled
# (known churn outcome); the rest form the pool that queries are drawn from.
# Stratifying on y keeps the class ratio the same in both parts.
X_train, X_pool, y_train, y_pool = train_test_split(
    X,
    y,
    train_size=0.1,
    stratify=y,
    random_state=42,
)

In [4]:
# Sanity check: (rows, columns) of the initial labeled set
X_train.shape

(100, 10)

In [5]:
# Sanity check: (rows, columns) of the pool that queries will be drawn from
X_pool.shape

(900, 10)

In [6]:
# Step 2: Fit the starting churn classifier using only the small labeled set.
# The seed is fixed so the forest is reproducible; class_weight="balanced"
# is requested because the two classes are not equally frequent.
model = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
)
model.fit(X_train, y_train)

In [7]:
# Step 3: Active Learning Loop
# Labeling budget: each query moves one customer from the pool into the training set.
n_queries = 10  # Number of queries (customers to be labeled)

# Test: walk through a single query step by hand

In [9]:
# Class-probability estimates for every customer still in the pool:
# one row per customer, one column per class.
# (The stray `X_pool.shape` that used to open this cell was removed: only the
# last expression of a cell is displayed, so it had no effect.)
probs = model.predict_proba(X_pool)
probs

array([[0.99, 0.01],
       [0.99, 0.01],
       [0.93, 0.07],
       ...,
       [0.99, 0.01],
       [0.21, 0.79],
       [0.82, 0.18]])

In [10]:
# Uncertainty sampling: score each pool customer by the probability of its
# most likely class.
# NOTE: despite its name, `uncertainty` holds that top-class probability, so
# SMALLER values mean the model is MORE uncertain about the customer.
uncertainty = probs.max(axis=1)
uncertainty

array([0.99, 0.99, 0.93, 0.82, 0.75, 0.64, 0.82, 0.81, 0.97, 0.85, 0.84,
       0.99, 0.98, 0.59, 0.99, 0.89, 0.88, 0.93, 0.96, 0.84, 0.91, 0.92,
       0.97, 1.  , 0.97, 0.98, 0.76, 0.83, 0.9 , 0.65, 0.51, 0.8 , 0.66,
       0.64, 0.95, 0.98, 0.89, 0.5 , 0.78, 0.99, 0.82, 0.87, 0.69, 0.74,
       0.52, 0.63, 0.94, 0.99, 0.91, 0.54, 0.6 , 0.97, 0.51, 0.79, 0.82,
       0.83, 0.9 , 0.96, 0.93, 0.98, 0.62, 0.99, 0.89, 0.98, 0.75, 0.98,
       0.98, 1.  , 0.94, 0.96, 0.99, 0.79, 0.98, 0.94, 0.78, 0.99, 0.55,
       0.94, 0.99, 0.92, 0.99, 0.67, 0.95, 0.88, 0.98, 0.88, 0.89, 0.89,
       0.82, 0.88, 0.79, 0.72, 0.96, 0.84, 0.83, 0.97, 0.96, 0.96, 0.59,
       0.56, 0.74, 0.53, 0.86, 0.98, 0.83, 0.81, 0.98, 0.99, 0.93, 0.96,
       0.86, 0.98, 0.81, 0.98, 0.78, 0.52, 0.81, 0.95, 0.67, 0.83, 0.97,
       0.77, 0.95, 0.97, 0.57, 0.98, 0.81, 0.77, 0.7 , 0.68, 0.9 , 0.99,
       0.93, 0.95, 0.79, 0.95, 0.82, 0.72, 0.93, 0.98, 0.78, 0.83, 0.89,
       0.98, 0.98, 0.98, 0.91, 0.84, 0.99, 1.  , 0.

In [14]:
# Inspect the labels of the pool (1 = minority class, 0 = majority class)
y_pool

array([0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,

In [11]:
# Row position, within the current pool, of the customer whose top-class
# probability is lowest (i.e. the prediction the model is least sure about).
query_idx = uncertainty.argmin()
query_idx

np.int64(37)

In [12]:
# Class probabilities predicted for the selected customer
probs[query_idx]

array([0.5, 0.5])

In [13]:
# Simulate the labeling step (e.g., a manual investigation of the customer):
# the label is simply looked up in y_pool.
# Length-1 slices keep the feature row 2-D and the label 1-D.
X_query = X_pool[query_idx:query_idx + 1]
y_query = y_pool[query_idx:query_idx + 1]

print(X_query)
print(y_query)

[[ 0.31978999 -1.76858268  3.85597173  0.67179917  1.62830732 -1.00972351
  -1.18774466 -1.44355703  1.16034818 -1.09235447]]
[0]


In [15]:
# Grow the labeled set with the newly labeled customer.
# NOTE: this cell rebinds X_train / y_train, so running it twice appends the
# same customer twice.
X_train = np.concatenate([X_train, X_query], axis=0)
y_train = np.append(y_train, y_query)

In [16]:
# Drop the queried customer from the pool so it cannot be selected again.
# The mask is built before X_pool is rebound so both arrays lose the same row.
keep_mask = np.ones(len(X_pool), dtype=bool)
keep_mask[query_idx] = False
X_pool = X_pool[keep_mask]
y_pool = y_pool[keep_mask]

# Running the full active-learning loop

In [18]:
# Reset the labeled set and the pool to their initial state, undoing the
# single manual query performed above. The arguments (including the seed) are
# the same as in the first split, so this reproduces the split the current
# model was originally fitted on.
X_train, X_pool, y_train, y_pool = train_test_split(
    X,
    y,
    train_size=0.1,
    stratify=y,
    random_state=42,
)

In [19]:
# Active-learning loop: on each pass, query the pool customer the current
# model is least sure about, move it (with its label) into the training set,
# and refit the model.
#
# The counter is a named, 1-based variable rather than `_`: `_` conventionally
# marks an unused value, and in IPython/Jupyter it is also the "last output"
# variable, which assigning to it would shadow.
for iteration in range(1, n_queries + 1):
    # Nothing left to query: stop instead of failing on an empty pool.
    if len(X_pool) == 0:
        print("Unlabeled pool is empty; stopping early.")
        break

    # Predict probabilities on the unlabeled customer pool
    probs = model.predict_proba(X_pool)

    # Uncertainty sampling: `uncertainty` holds the top-class probability,
    # so the LOWEST value marks the MOST uncertain customer.
    uncertainty = np.max(probs, axis=1)
    query_idx = np.argmin(uncertainty)

    # Simulate the labeling process (e.g., manual investigation of customer data)
    X_query = X_pool[query_idx].reshape(1, -1)
    y_query = y_pool[query_idx].reshape(1, )

    # Add the selected customer to the training set
    X_train = np.vstack([X_train, X_query])
    y_train = np.concatenate([y_train, y_query])

    # Remove the selected customer from the pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx, axis=0)

    # Retrain the churn prediction model on the updated training set
    model.fit(X_train, y_train)

    # Report scores on the training set itself. These are computed on the rows
    # the model was just fitted to, so they say nothing about generalization.
    y_pred_train = model.predict(X_train)
    print(f"Iteration {iteration}, Training Accuracy: {accuracy_score(y_train, y_pred_train):.4f}")
    print(f"Iteration {iteration}, Training f1-score: {f1_score(y_train, y_pred_train):.4f}")

Iteration 1, Training Accuracy: 1.0000
Iteration 1, Training f1-score: 1.0000
Iteration 2, Training Accuracy: 1.0000
Iteration 2, Training f1-score: 1.0000
Iteration 3, Training Accuracy: 1.0000
Iteration 3, Training f1-score: 1.0000
Iteration 4, Training Accuracy: 1.0000
Iteration 4, Training f1-score: 1.0000
Iteration 5, Training Accuracy: 1.0000
Iteration 5, Training f1-score: 1.0000
Iteration 6, Training Accuracy: 1.0000
Iteration 6, Training f1-score: 1.0000
Iteration 7, Training Accuracy: 1.0000
Iteration 7, Training f1-score: 1.0000
Iteration 8, Training Accuracy: 1.0000
Iteration 8, Training f1-score: 1.0000
Iteration 9, Training Accuracy: 1.0000
Iteration 9, Training f1-score: 1.0000
Iteration 10, Training Accuracy: 1.0000
Iteration 10, Training f1-score: 1.0000


In [10]:
# Final evaluation.
# Accuracy over all of X is kept for continuity, but it is optimistic: X
# includes the rows the model was trained on.
y_pred = model.predict(X)
print(f"Final Accuracy on the entire dataset: {accuracy_score(y, y_pred):.4f}")

# Held-out view: the customers still in the pool were never added to the
# training set (assuming the cells above were run in order), so scores on them
# are not inflated by training rows. F1 is reported alongside accuracy because
# the classes are imbalanced.
y_pred_pool = model.predict(X_pool)
print(f"Final Accuracy on the remaining pool (held out): {accuracy_score(y_pool, y_pred_pool):.4f}")
print(f"Final f1-score on the remaining pool (held out): {f1_score(y_pool, y_pred_pool):.4f}")

Final Accuracy on the entire dataset: 0.8990
