In [1]:
import numpy as np
from river import datasets
from river import evaluate
from river import linear_model
from river import metrics
from river import optim
from river import preprocessing
from river import drift
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
dataset = datasets.synth.Agrawal(
     classification_function=0,
     seed=42
)

## Model Training

#### Model 1 with 59.5% Accuracy

In [3]:
LRmodel1 = Pipeline([('scaler', StandardScaler()), ('logreg' ,LogisticRegression())])

In [4]:
# Train model with 100 samples
feature_buffer = list()
label_buffer = list()

for i,data in enumerate(dataset.take(100)):
    # data[0] - features, data[1] - label
    feature_buffer.append(list(data[0].values()))
    label_buffer.append(data[1])

In [5]:
LRmodel1.fit(feature_buffer, label_buffer)

In [6]:
drift_detector = drift.ADWIN(delta=1.2)
drifts = list()
# Initialise metric
metric = metrics.Accuracy()
print_warning = True
feature_buffer = list()
label_buffer = list()

for i,data in enumerate(dataset.take(1000)):
    # data[0] - features, data[1] - label
    y_pred = LRmodel1.predict(np.reshape(list(data[0].values()), (-1, 1)).T)
    metric = metric.update(data[1],y_pred[0])
    correctly_classified = y_pred == data[1]
    drift_detector.update(correctly_classified)   # Data is processed one sample at a time
    feature_buffer.append(list(data[0].values()))
    label_buffer.append(data[1])
    if drift_detector.drift_detected:
        # The drift detector indicates after each sample if there is a drift in the data
        print(f'Change detected at index {i}')
        drifts.append(i)

Change detected at index 95


In [7]:
metric

Accuracy: 59.50%

#### Model 2 with 67.40% Accuracy

In [8]:
# Train model with 500 samples
feature_buffer2 = list()
label_buffer2 = list()

for i,data in enumerate(dataset.take(500)):
    # data[0] - features, data[1] - label
    feature_buffer2.append(list(data[0].values()))
    label_buffer2.append(data[1])

In [9]:
LRmodel2 = Pipeline([('scaler', StandardScaler()), ('logreg' ,LogisticRegression())])

In [10]:
LRmodel2.fit(feature_buffer2, label_buffer2)

In [11]:
drift_detector = drift.ADWIN(delta=1.2)
drifts = list()
# Initialise metric
metric = metrics.Accuracy()
print_warning = True
feature_buffer = list()
label_buffer = list()

for i,data in enumerate(dataset.take(1000)):
    # data[0] - features, data[1] - label
    y_pred = LRmodel2.predict(np.reshape(list(data[0].values()), (-1, 1)).T)
    metric = metric.update(data[1],y_pred[0])
    correctly_classified = y_pred == data[1]
    drift_detector.update(correctly_classified)   # Data is processed one sample at a time
    feature_buffer.append(list(data[0].values()))
    label_buffer.append(data[1])
    if drift_detector.drift_detected:
        # The drift detector indicates after each sample if there is a drift in the data
        print(f'Change detected at index {i}')
        drifts.append(i)

Change detected at index 95


In [12]:
metric

Accuracy: 67.40%

#### Model 3 with Accuracy

In [13]:
# Train model with 200 samples
feature_buffer3 = list()
label_buffer3 = list()

for i,data in enumerate(dataset.take(200)):
    # data[0] - features, data[1] - label
    feature_buffer3.append(list(data[0].values()))
    label_buffer3.append(data[1])

In [14]:
LRmodel3 = Pipeline([('scaler', StandardScaler()), ('logreg' ,LogisticRegression())])

In [15]:
LRmodel3.fit(feature_buffer3, label_buffer3)

In [16]:
drift_detector = drift.ADWIN(delta=1.2)
drifts = list()
# Initialise metric
metric = metrics.Accuracy()
print_warning = True
feature_buffer = list()
label_buffer = list()

for i,data in enumerate(dataset.take(1000)):
    # data[0] - features, data[1] - label
    y_pred = LRmodel3.predict(np.reshape(list(data[0].values()), (-1, 1)).T)
    metric = metric.update(data[1],y_pred[0])
    correctly_classified = y_pred == data[1]
    drift_detector.update(correctly_classified)   # Data is processed one sample at a time
    feature_buffer.append(list(data[0].values()))
    label_buffer.append(data[1])
    if drift_detector.drift_detected:
        # The drift detector indicates after each sample if there is a drift in the data
        print(f'Change detected at index {i}')
        drifts.append(i)

Change detected at index 95


In [17]:
metric

Accuracy: 65.10%

Saving three models with Accuracy scores:
- Model1:  59.50%
- Model2:  67.40% 
- Model3:  65.10%

## Loading saved models and benchmarking them

In [18]:
LRmodel1 = pickle.load(open('../saved_models/classification/sklearn/LRmodel1.pkl', 'rb'))

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [19]:
# Initialize drift detector
drift_detector = drift.ADWIN(delta=1.2)
drifts = list()

# Initialise metric
metric = metrics.Accuracy()
print_warning = True
feature_buffer = list()
label_buffer = list()

for i,data in enumerate(dataset.take(1000)):
    # data[0] - features, data[1] - label
    y_pred = LRmodel1.predict(np.reshape(list(data[0].values()), (-1, 1)).T)
    metric = metric.update(data[1],y_pred[0])
    correctly_classified = y_pred == data[1]
    drift_detector.update(correctly_classified)   # Data is processed one sample at a time
    feature_buffer.append(list(data[0].values()))
    label_buffer.append(data[1])
    if drift_detector.drift_detected:
        # The drift detector indicates after each sample if there is a drift in the data
        print(f'Change detected at index {i}')
        drifts.append(i)
print(metric)

Change detected at index 95
Accuracy: 59.50%


In [20]:
LRmodel2 = pickle.load(open('../saved_models/classification/sklearn/LRmodel2.pkl', 'rb'))

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [21]:
drift_detector = drift.ADWIN(delta=1.2)
drifts = list()
# Initialise metric
metric = metrics.Accuracy()
print_warning = True
feature_buffer = list()
label_buffer = list()

for i,data in enumerate(dataset.take(1000)):
    # data[0] - features, data[1] - label
    y_pred = LRmodel2.predict(np.reshape(list(data[0].values()), (-1, 1)).T)
    metric = metric.update(data[1],y_pred[0])
    correctly_classified = y_pred == data[1]
    drift_detector.update(correctly_classified)   # Data is processed one sample at a time
    feature_buffer.append(list(data[0].values()))
    label_buffer.append(data[1])
    if drift_detector.drift_detected:
        # The drift detector indicates after each sample if there is a drift in the data
        print(f'Change detected at index {i}')
        drifts.append(i)
print(metric)

Change detected at index 95
Accuracy: 67.40%


In [22]:
LRmodel3 = pickle.load(open('../saved_models/classification/sklearn/LRmodel3.pkl', 'rb'))

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [23]:
drift_detector = drift.ADWIN(delta=1.2)
drifts = list()
# Initialise metric
metric = metrics.Accuracy()
print_warning = True
feature_buffer = list()
label_buffer = list()

for i,data in enumerate(dataset.take(1000)):
    # data[0] - features, data[1] - label
    y_pred = LRmodel3.predict(np.reshape(list(data[0].values()), (-1, 1)).T)
    metric = metric.update(data[1],y_pred[0])
    correctly_classified = y_pred == data[1]
    drift_detector.update(correctly_classified)   # Data is processed one sample at a time
    feature_buffer.append(list(data[0].values()))
    label_buffer.append(data[1])
    if drift_detector.drift_detected:
        # The drift detector indicates after each sample if there is a drift in the data
        print(f'Change detected at index {i}')
        drifts.append(i)
print(metric)

Change detected at index 95
Accuracy: 65.10%


In [24]:
PretrainedModels = dict()

In [25]:
PretrainedModels["model1"]=dict()
PretrainedModels["model2"]=dict()
PretrainedModels["model3"]=dict()

In [26]:
PretrainedModels["model1"]["name"]="../saved_models/classification/sklearn/LRmodel1.pkl"
PretrainedModels["model2"]["name"]="../saved_models/classification/sklearn/LRmodel2.pkl"
PretrainedModels["model3"]["name"]="../saved_models/classification/sklearn/LRmodel3.pkl"

In [27]:
PretrainedModels["model1"]["accuracy"]="59.50%"
PretrainedModels["model2"]["accuracy"]="67.40%"
PretrainedModels["model3"]["accuracy"]="65.10%"

In [28]:
PretrainedModels

{'model1': {'name': '../saved_models/classification/sklearn/LRmodel1.pkl',
  'accuracy': '59.50%'},
 'model2': {'name': '../saved_models/classification/sklearn/LRmodel2.pkl',
  'accuracy': '67.40%'},
 'model3': {'name': '../saved_models/classification/sklearn/LRmodel3.pkl',
  'accuracy': '65.10%'}}

In [29]:
PretrainedModels['model1']['name']

'../saved_models/classification/sklearn/LRmodel1.pkl'

In [30]:
LRmodel1 = pickle.load(open(PretrainedModels['model1']['name'], 'rb'))
LRmodel1

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [31]:
LRmodel = pickle.load(open(PretrainedModels['model1']['name'], 'rb'))

drift_detector = drift.ADWIN(delta=0.99)
print(drift_detector)
drifts = list()
correctly_classified_data = list()
feature_buffer = list()
label_buffer = list()

for i,data in enumerate(dataset.take(16000)):
    y_pred = LRmodel.predict(np.reshape(list(data[0].values()), (-1, 1)).T)
    metric = metric.update(data[1],y_pred[0]) # Accuracy
    correctly_classified = y_pred[0]==data[1]  # checking accuracy
    correctly_classified_data.append(correctly_classified)
    drift_detector.update(correctly_classified) 
    feature_buffer.append(list(data[0].values()))
    label_buffer.append(data[1])
    if drift_detector.drift_detected:
        print(f'Change detected at index {i}')
        drifts.append(i)
        print(f'Using pre-trained model with better accuracy')
        LRmodel = pickle.load(open(PretrainedModels['model2']['name'], 'rb'))
        break

ADWIN
Change detected at index 95
Using pre-trained model with better accuracy


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [32]:
metric

Accuracy: 64.96%