In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
  import pandas as pd
  import numpy as np
  import seaborn as sns
  import matplotlib.pyplot as plt

In [45]:
import numpy as np
import pandas as pd

def generate_drifting_sea_autoregressive(n_samples=50000, noise=0.2, drift_points=[15000, 30000], seed=42):
    np.random.seed(seed)

    f1 = np.random.uniform(0, 10, n_samples)
    f2 = np.random.uniform(0, 10, n_samples)
    y = np.zeros(n_samples)

    # Define coefficients per concept
    concepts = [
        {'a1': 0.5, 'a2': 0.5, 'ar1': 0.7, 'ar2': -0.1},  # Concept 1
        {'a1': 1.2, 'a2': -0.3, 'ar1': 0.4, 'ar2': -0.2}, # Concept 2
        {'a1': -0.5, 'a2': 0.9, 'ar1': 0.1, 'ar2': 0.5},  # Concept 3
    ]

    concept_idx = 0
    for t in range(2, n_samples):
        if concept_idx + 1 < len(drift_points) and t > drift_points[concept_idx]:
            concept_idx += 1

        c = concepts[concept_idx]
        y[t] = (
            c['a1'] * f1[t] +
            c['a2'] * f2[t] +
            c['ar1'] * y[t - 1] +
            c['ar2'] * y[t - 2] +
            np.random.normal(0, noise)
        )

    df = pd.DataFrame({
        'f1': f1,
        'f2': f2,
        'target': y
    })

    # Add synthetic datetime
    df['Date_Time'] = pd.date_range(start="2020-01-01", periods=n_samples, freq='T')
    df.set_index('Date_Time', inplace=True)

    return df

# Generate the dataset
df_sea_drift = generate_drifting_sea_autoregressive()

# Save to CSV
file_path = "/content/drive/MyDrive/Re-Submissions/MAAR/SEA_autoregressive_stream2.csv"
df_sea_drift.to_csv(file_path)

file_path


'/content/drive/MyDrive/Re-Submissions/MAAR/SEA_autoregressive_stream2.csv'

In [70]:
src_minute = "/content/drive/MyDrive/Re-Submissions/MAAR/SEA_autoregressive_stream.csv"
src_minute

'/content/drive/MyDrive/Re-Submissions/MAAR/SEA_autoregressive_stream.csv'

In [47]:
# Load SEA dataset (no datetime column yet)
df_minute = pd.read_csv(src_minute)

# Create synthetic datetime index (1-minute intervals)
start_time = pd.to_datetime("2020-01-01 00:00:00")  # Arbitrary start date
df_minute['Date_Time'] = pd.date_range(start=start_time, periods=len(df_minute), freq='T')

# Set index to Date_Time for compatibility with old code
df_minute.set_index('Date_Time', drop=False, inplace=True)

df_minute.head()


Unnamed: 0_level_0,f1,f2,target,Date_Time
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-01 00:00:00,3.745401,8.472366,0.0,2020-01-01 00:00:00
2020-01-01 00:01:00,9.507143,4.94517,0.0,2020-01-01 00:01:00
2020-01-01 00:02:00,7.319939,1.954656,4.80584,2020-01-01 00:02:00
2020-01-01 00:03:00,5.986585,7.366418,10.545367,2020-01-01 00:03:00
2020-01-01 00:04:00,1.560186,4.186781,9.75463,2020-01-01 00:04:00


In [71]:
newdf = df_minute[['target']]
newdf

Unnamed: 0_level_0,target
Date_Time,Unnamed: 1_level_1
2020-01-01 00:00:00,0.000000
2020-01-01 00:01:00,0.000000
2020-01-01 00:02:00,4.805840
2020-01-01 00:03:00,10.545367
2020-01-01 00:04:00,9.754630
...,...
2020-02-04 17:15:00,11.939940
2020-02-04 17:16:00,11.099502
2020-02-04 17:17:00,11.033651
2020-02-04 17:18:00,9.761370


In [49]:
newdf.shape

(50000, 1)

In [50]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    dff = pd.DataFrame(data)
    cols, names = list(), list()
    for i in range(n_in, 0, -1):
        cols.append(dff.shift(-i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    for i in range(0, n_out):
        cols.append(dff.shift(-i))
        if i==0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1)) for j in range(n_vars)]
        agg = pd.concat(cols, axis=1)
        agg.columns = names
        if dropnan:
            agg.dropna(inplace=True)
        return agg

In [51]:
reframed = series_to_supervised(newdf.values, 90, 1)
reframed.head()

Unnamed: 0,var1(t-90),var1(t-89),var1(t-88),var1(t-87),var1(t-86),var1(t-85),var1(t-84),var1(t-83),var1(t-82),var1(t-81),...,var1(t-9),var1(t-8),var1(t-7),var1(t-6),var1(t-5),var1(t-4),var1(t-3),var1(t-2),var1(t-1),var1(t)
0,15.847149,18.017243,18.65806,15.365952,14.307753,10.566044,11.648431,9.954196,15.434844,15.58236,...,12.584953,11.935057,10.884787,6.497457,9.087294,9.75463,10.545367,4.80584,0.0,0.0
1,16.458478,15.847149,18.017243,18.65806,15.365952,14.307753,10.566044,11.648431,9.954196,15.434844,...,12.643693,12.584953,11.935057,10.884787,6.497457,9.087294,9.75463,10.545367,4.80584,0.0
2,17.83166,16.458478,15.847149,18.017243,18.65806,15.365952,14.307753,10.566044,11.648431,9.954196,...,14.177075,12.643693,12.584953,11.935057,10.884787,6.497457,9.087294,9.75463,10.545367,4.80584
3,17.161606,17.83166,16.458478,15.847149,18.017243,18.65806,15.365952,14.307753,10.566044,11.648431,...,16.139359,14.177075,12.643693,12.584953,11.935057,10.884787,6.497457,9.087294,9.75463,10.545367
4,15.072243,17.161606,17.83166,16.458478,15.847149,18.017243,18.65806,15.365952,14.307753,10.566044,...,13.766271,16.139359,14.177075,12.643693,12.584953,11.935057,10.884787,6.497457,9.087294,9.75463


In [52]:
reframed.values.shape

(49910, 91)

In [53]:
  df=reframed.iloc[0:1440]

In [54]:
train=reframed.drop("var1(t-90)",axis=1)
test=reframed[["var1(t-90)"]]
test

Unnamed: 0,var1(t-90)
0,15.847149
1,16.458478
2,17.831660
3,17.161606
4,15.072243
...,...
49905,11.939940
49906,11.099502
49907,11.033651
49908,9.761370


In [55]:
x=train.iloc[0:1440]
y=test.iloc[0:1440]
y.columns[0]

'var1(t-90)'

In [14]:
!pip install river

Collecting pandas<3.0.0,>=2.2.3 (from river)
  Using cached pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting scipy<2.0.0,>=1.14.1 (from river)
  Downloading scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Using cached pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
Downloading scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.7/37.7 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy, pandas
  Attempting uninstall: scipy
    Found existing installation: scipy 1.11.4
    Uninstalling scipy-1.11.4:
      Successfully uninstalled scipy-1.11.4
  Attempting uninstall: pandas
    Found existing installation: pandas 2.

In [15]:
!pip install pycaret

Collecting pandas<2.2.0 (from pycaret)
  Using cached pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy<=1.11.4,>=1.6.1 (from pycaret)
  Using cached scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Using cached pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
Using cached scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.4 MB)
Installing collected packages: scipy, pandas
  Attempting uninstall: scipy
    Found existing installation: scipy 1.15.3
    Uninstalling scipy-1.15.3:
      Successfully uninstalled scipy-1.15.3
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.3
    Uninstalling pandas-2.2.3:
      Successfully uninstalled pandas-2.2.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependenc

In [16]:
  import jinja2
  from pycaret.regression import setup, compare_models, pull, predict_model, finalize_model

Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x7f95596dfd80>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 1005, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 1187, in _make_controller_from_path
    lib_controller = controller_class(
                     ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 114, in __init__
    self.dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/ctypes/__init__.py", line 376, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: dlopen() error


In [56]:
target = test.columns[0]
numeric_features = x.columns.tolist()
target

'var1(t-90)'

In [57]:
  from datetime import datetime

In [58]:
from pycaret.regression import setup, compare_models, finalize_model

# Setup the environment
s = setup(data = df, target = "var1(t-90)", session_id = 123)

# Use the 'include' parameter to limit the models being compared,
# or adjust 'fold' parameter to reduce computation. This won't exactly stop at 80%,
# but it's a way to manage computation time.
# Here, you can specify a subset of models to reduce the computation load.
# Adjusting this list or other parameters can indirectly affect the total computation time.
models_to_compare = ['et', 'xgboost', 'rf','gbr','huber','lr','ridge','br','lar','omp']   # Example: Only include linear regression, decision tree, and random forest

best = compare_models(include=models_to_compare, sort='MAE', fold=5)  # Adjust 'fold' to control execution time

final_best_model = finalize_model(best)


Unnamed: 0,Description,Value
0,Session id,123
1,Target,var1(t-90)
2,Target type,Regression
3,Original data shape,"(1440, 91)"
4,Transformed data shape,"(1440, 91)"
5,Transformed train set shape,"(1007, 91)"
6,Transformed test set shape,"(433, 91)"
7,Numeric features,90
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
omp,Orthogonal Matching Pursuit,1.7408,4.5642,2.1332,0.4198,0.1653,0.1482,0.028
et,Extra Trees Regressor,1.7474,4.6056,2.1438,0.4142,0.1664,0.1493,2.624
rf,Random Forest Regressor,1.7585,4.6411,2.1525,0.4098,0.1676,0.1508,4.402
gbr,Gradient Boosting Regressor,1.782,4.8153,2.1927,0.3875,0.1702,0.1526,2.028
br,Bayesian Ridge,1.8037,4.8473,2.1986,0.384,0.1706,0.1544,0.05
lr,Linear Regression,1.8216,4.9704,2.2254,0.3685,0.1719,0.1547,0.048
ridge,Ridge Regression,1.8216,4.9698,2.2252,0.3685,0.1719,0.1547,0.05
lar,Least Angle Regression,1.843,5.0926,2.2522,0.3532,0.1738,0.1563,0.044
huber,Huber Regressor,1.8524,5.1365,2.2631,0.3464,0.1742,0.1565,0.12
xgboost,Extreme Gradient Boosting,1.8782,5.4812,2.3406,0.3021,0.1816,0.1612,2.782


Processing:   0%|          | 0/45 [00:00<?, ?it/s]

In [59]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_squared_log_error, mean_absolute_percentage_error

In [60]:
final_best_model.fit(x,y)

In [61]:
final_best_model.predict(x)

array([16.41802485, 15.273997  , 15.36617401, ...,  9.23546282,
       10.88312059, 14.21879823])

In [63]:
x=train.iloc[0:1440]
final_best_model.predict(x)

array([16.41802485, 15.273997  , 15.36617401, ...,  9.23546282,
       10.88312059, 14.21879823])

In [64]:
def automodel(data):
  s = setup(data = data, target = "var1(t-90)",session_id = 123)

  models_to_compare = ['et', 'xgboost', 'rf','gbr','huber','lr','ridge','br','lar','omp']
  best = compare_models(include=models_to_compare, sort='MAE', fold=5)
  #best = compare_models(sort='MAE')
  best_model = finalize_model(best)
  return best_model

In [65]:
from river.drift import PageHinkley, KSWIN, ADWIN

In [66]:
from river.drift import PageHinkley, ADWIN, KSWIN

# Method 1 - Page Hinkley
ph = PageHinkley(min_instances=60, threshold=5)

# Method 2 - Adaptive Window
ad = ADWIN(delta=0.004)

# Method 3 - Kolmogorov-Smirnov Windowing method
ks = KSWIN(alpha=0.001, window_size=500, stat_size=150, seed=None)

# Dictionary of drift detection methods
methods = {
    "page-hinkley": ph,
    "adaptive-window": ad,
    "Kolmogorov-Smirnov": ks
}

# Dictionary to track drift detection results
drift_det = {
    "page-hinkley": 0,
    "adaptive-window": 0,
    "Kolmogorov-Smirnov": 0
}

In [79]:
drift_count = 0  # Initialize drift counter outside the loops

for val in y.values.flatten():
    for i, dd in methods.items():
        dd.update(val)  # Add new value to the drift detector
        if dd.drift_detected:  # Check if drift is detected
            drift_count += 1  # Increment drift counter
            print(f"Drift Detected at {val}, Total Drifts: {drift_count}")

Drift Detected at 12.129290686527796, Total Drifts: 1
Drift Detected at 12.487785307394622, Total Drifts: 2
Drift Detected at 12.8777040150667, Total Drifts: 3
Drift Detected at 15.782967954669044, Total Drifts: 4
Drift Detected at 12.267668606980925, Total Drifts: 5
Drift Detected at 10.128487279161392, Total Drifts: 6
Drift Detected at 14.730675844815847, Total Drifts: 7
Drift Detected at 12.41632769036412, Total Drifts: 8
Drift Detected at 14.919727081647356, Total Drifts: 9
Drift Detected at 11.929958325575695, Total Drifts: 10
Drift Detected at 9.27674933989764, Total Drifts: 11
Drift Detected at 20.046266103173565, Total Drifts: 12
Drift Detected at 12.934013680470969, Total Drifts: 13
Drift Detected at 13.231741383049176, Total Drifts: 14
Drift Detected at 13.287062717736957, Total Drifts: 15
Drift Detected at 15.134616794089084, Total Drifts: 16
Drift Detected at 14.553100503944853, Total Drifts: 17
Drift Detected at 6.703090544860819, Total Drifts: 18
Drift Detected at 14.3606

In [68]:
from river import drift

In [83]:
k = 1441 * 2
d = 0

# Drift detector from River
adwin = drift.ADWIN()

for i in range(1, 100):  # One segment
    x = train.iloc[k * i:k * (i + 1)]
    y = test.iloc[k * i:k * (i + 1)]
    dfval = y.values.flatten()

    for j in range(1, 1440):
        val = dfval[j - 1]

        # Detect drift based on prediction error (if available) or actual value
        if j > 1:
            # Ensure input is a DataFrame with correct columns
            pred_val = final_best_model.predict(x.iloc[[j - 1]])[0]
            error = abs(val - pred_val)
            adwin.update(error)
        else:
            adwin.update(val)  # Use actual value for early initialization

        if adwin.drift_detected:
            h = train.iloc[k * i:(k * i) + j]
            hp = test.iloc[k * i:(k * i) + j]
            pred = final_best_model.predict(h)
            mae = mean_absolute_error(hp, pred)

            if mae > 0.02:
                print(f"⚠️ Drift detected and performance degraded below threshold (MAE = {mae:.3f}), so we are updating model based on AutoML")
                d = k * i + j
                windf = reframed.iloc[d - 1440:d]
                x_val = train.iloc[d - 1440:d]
                y_val = test.iloc[d - 1440:d]
                model = automodel(windf)
                model.fit(x_val, y_val)
                final_best_model = model
                adwin = drift.ADWIN()  # Reset ADWIN
                k = d
            else:
                print(f"✅ Drift detected but performance is good with MAE: {mae:.3f}")

✅ Drift detected but performance is good with MAE: 0.000
⚠️ Drift detected and performance degraded below threshold (MAE = 0.303), so we are updating model based on AutoML


Unnamed: 0,Description,Value
0,Session id,123
1,Target,var1(t-90)
2,Target type,Regression
3,Original data shape,"(1440, 91)"
4,Transformed data shape,"(1440, 91)"
5,Transformed train set shape,"(1007, 91)"
6,Transformed test set shape,"(433, 91)"
7,Numeric features,90
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,1.9008,5.1029,2.2582,0.3675,0.2525,0.2551,2.646
rf,Random Forest Regressor,1.9282,5.1935,2.2782,0.3564,0.254,0.2574,4.326
omp,Orthogonal Matching Pursuit,1.9395,5.2894,2.2993,0.3442,0.2552,0.2576,0.03
br,Bayesian Ridge,1.9452,5.3926,2.3214,0.3323,0.2587,0.261,0.052
lr,Linear Regression,1.9648,5.5234,2.3496,0.3154,0.2612,0.2603,0.054
ridge,Ridge Regression,1.9648,5.5229,2.3495,0.3155,0.2611,0.2603,0.052
gbr,Gradient Boosting Regressor,1.9684,5.5876,2.3633,0.3072,0.264,0.2646,2.028
lar,Least Angle Regression,1.9834,5.6833,2.3835,0.2952,0.2652,0.263,0.044
huber,Huber Regressor,1.9959,5.7541,2.3982,0.2859,0.2667,0.2639,0.136
xgboost,Extreme Gradient Boosting,2.0606,6.1106,2.4701,0.2438,0.2738,0.2728,2.304


Processing:   0%|          | 0/45 [00:00<?, ?it/s]

IndexError: index 0 is out of bounds for axis 0 with size 0