In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('/Users/aaronmatthews/Google Drive/My Drive/ml_datasets/lead/train.csv')
data['timestamp'] = pd.to_datetime(data['timestamp'])
data.set_index('timestamp', inplace=True)
data.head()

Unnamed: 0_level_0,building_id,meter_reading,anomaly
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-01-01,1,,0
2016-01-01,32,,0
2016-01-01,41,,0
2016-01-01,55,,0
2016-01-01,69,,0


In [3]:
from sklearn.model_selection import GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=2, test_size=.2, random_state=42)

for train_index, valid_index in gss.split(data, groups = data.building_id):
    train, valid = data.iloc[train_index], data.iloc[valid_index]

In [4]:
train.building_id.unique()

array([  32,   41,   55,   79,   82,   91,  107,  111,  117,  118,  136,
        137,  139,  141,  144,  147,  148,  149,  159,  171,  173,  174,
        181,  183,  235,  238,  246,  248,  253,  254,  263,  270,  275,
        276,  278,  293,  312,  318,  335,  345,  356,  423,  439,  492,
        534,  623,  653,  657,  658,  666,  673,  675,  677,  683,  685,
        687,  693,  698,  701,  708,  710,  721,  729,  730,  732,  742,
        827,  844,  848,  880,  882,  884,  887,  889,  892,  893,  894,
        895,  896,  905,  914,  922,  924,  926,  928,  929,  931,  935,
        936,  942,  948,  950,  952,  961,  967,  968,  970,  973,  974,
        975,  977,  978,  988,  990,  994,  996, 1001, 1007, 1073, 1106,
       1120, 1128, 1137, 1141, 1143, 1147, 1172, 1219, 1225, 1226, 1232,
       1234, 1238, 1239, 1241, 1242, 1246, 1247, 1249, 1251, 1252, 1253,
       1257, 1258, 1259, 1260, 1261, 1264, 1266, 1272, 1275, 1278, 1283,
       1284, 1285, 1296, 1297, 1300, 1302, 1303, 13

In [17]:
X_train = train.reset_index()
X_train.set_index(['timestamp', 'building_id'], drop=True, inplace=True)
X_train = X_train[~X_train['meter_reading'].isnull()]

y_train = X_train['anomaly']
X_train.drop('anomaly', axis=1, inplace=True)
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,meter_reading
timestamp,building_id,Unnamed: 2_level_1
2016-01-01 00:00:00,107,175.184
2016-01-01 00:00:00,111,167.392
2016-01-01 00:00:00,117,16.306
2016-01-01 00:00:00,118,117.200
2016-01-01 00:00:00,137,14.250
...,...,...
2016-12-31 23:00:00,1315,32.520
2016-12-31 23:00:00,1316,38.844
2016-12-31 23:00:00,1318,202.893
2016-12-31 23:00:00,1323,172.000


In [18]:
y_train.mean()

0.0229242006108665

# Isolation Forest Baseline

In [7]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score

In [8]:
iso = IsolationForest(contamination = 0.02, random_state=42)
iso.fit(X_train)

In [9]:
iso.n_estimators

100

In [10]:
preds = np.where(iso.predict(X_train) == -1, 1, 0)
preds


array([0, 0, 0, ..., 0, 0, 0])

In [11]:
roc_auc_score(y_train.values, preds)

0.5040197527753827

## Isolation Forest Hyperparameter Tuning

In [12]:
import optuna
from sklearn.model_selection import GroupKFold

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
def objective(trial, X, y, n_splits, n_repeats, pipe_feed):
    random.seed(42)
    param_grid = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 150),
        'max_samples': trial.suggest_float('max_samples', 0.5, 1.0),
        'contamination': trial.suggest_float('contamination', 0.01, 0.05),
        # max features
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }
    
    

In [25]:
n_splits = 5
cv = KFold(n_splits = n_splits, shuffle=True, random_state=42)

for idx, (train_idx, valid_idx) in enumerate(cv.split(X_train, y_train)):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

In [53]:
n_splits = 5
gkf = GroupKFold(n_splits = n_splits)

counter = 0
for idx, (train_idx, valid_idx) in enumerate(gkf.split(X_train, 
                                                       y_train, 
                                                       groups=X_train.index.get_level_values(1).to_list())):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]
    counter += 1
    print(counter)

1
2
3
4
5


Notes
* You should be using a time-based split. You can take into account the previous average by building id. 

In [63]:
X_val.xs(111, level=1, axis=0)

Unnamed: 0_level_0,meter_reading
timestamp,Unnamed: 1_level_1
2016-01-01 00:00:00,167.392
2016-01-01 01:00:00,334.784
2016-01-01 02:00:00,1.000
2016-01-01 03:00:00,1.000
2016-01-01 04:00:00,1.000
...,...
2016-12-31 19:00:00,303.000
2016-12-31 20:00:00,299.125
2016-12-31 21:00:00,300.625
2016-12-31 22:00:00,302.125


In [52]:
X_train.shape

(1316818, 1)

In [None]:
from sklearn.model_selection import GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=2, test_size=.2, random_state=42)

for train_index, valid_index in gss.split(data, groups = data.building_id):
    train, valid = data.iloc[train_index], data.iloc[valid_index]

You should split by both group and timestamp

In [14]:
from sklearn.model_selection import TimeSeriesSplit

# Example data
X = np.array([[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]])
y = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# TimeSeriesSplit object
tscv = TimeSeriesSplit(n_splits=3)

# Performing the split
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]


TRAIN: [0 1 2 3] TEST: [4 5]
TRAIN: [0 1 2 3 4 5] TEST: [6 7]
TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9]


In [22]:
from sklearn.datasets import make_classification
X, y = make_classification(n_samples = 20)

In [25]:
X_df = pd.DataFrame(X)
X_df['date_time'] = pd.to_datetime('10-01-2022')

for index, row in X_df.iterrows():
    X_df.loc[index, ['date_time']] = X_df.loc[index, ['date_time']] + pd.Timedelta(days = index)
    
X_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,date_time
0,0.588561,-0.193243,-1.22708,-0.000495,0.726908,-0.886884,-0.282666,-0.404685,-1.652335,-0.60707,...,-0.25887,1.539722,1.48948,1.091638,1.36781,-0.353755,0.784078,-0.391407,-0.635673,2022-10-01
1,-0.6787,0.365945,0.564208,0.297455,-0.152935,0.972394,0.342109,0.204658,0.059072,-0.38268,...,0.096582,1.081056,-0.278623,-1.725578,0.571373,-0.910999,-1.057464,0.061668,0.850646,2022-10-02
2,0.067124,-0.531725,-0.826345,-0.403787,0.507967,0.194544,-0.064925,-0.492427,0.327256,1.184605,...,-0.156632,-1.809878,-0.555484,2.44985,0.513788,0.626716,0.379857,1.328182,-1.217309,2022-10-03
3,-0.579354,0.77252,1.042365,1.016934,-0.056837,0.499802,-0.157241,0.998218,0.554115,0.023337,...,-1.222974,-0.275133,1.718604,0.63706,-0.438652,2.209976,0.969544,-0.317953,-0.899604,2022-10-04
4,-1.249835,0.238171,-0.504418,1.475661,-0.200072,2.124302,0.186242,-0.371064,1.398202,-0.211909,...,-1.308797,-0.297016,0.949872,-1.036064,0.064619,0.200498,2.410334,-1.055133,-0.163325,2022-10-05
5,-0.409181,0.265052,-0.616556,-1.596499,-1.537053,-2.73698,-0.273126,0.024902,0.548044,1.96928,...,1.593344,0.931128,0.539993,0.374438,0.670442,1.451918,1.28727,-0.030703,0.61155,2022-10-06
6,-0.813102,-0.847842,-1.166734,0.371249,1.051007,0.320052,-0.212405,0.063099,0.920132,0.02515,...,-0.736779,1.528494,-0.947632,1.454334,0.33364,-0.742174,0.036973,0.886268,-1.04015,2022-10-07
7,-1.013868,0.63195,0.260155,1.560572,-0.737168,-1.328835,-1.56371,-2.026295,1.678776,-1.043088,...,-1.467828,0.637047,-1.138001,-0.743339,2.609878,0.588979,-0.332431,-0.947573,-0.377978,2022-10-08
8,0.811173,-0.77208,1.318403,-0.595384,-0.112518,-1.053627,-0.004755,-1.563355,0.288746,-1.12261,...,1.03038,0.374095,-1.130289,-1.695976,0.330837,-1.253061,-0.199475,0.051984,1.297398,2022-10-09
9,-0.039829,-0.212668,-0.713316,-2.119751,1.950223,-0.647299,0.119071,-0.609751,-2.044126,-0.434583,...,2.574638,-0.46829,-0.641211,-1.434844,-0.417315,-1.126104,-1.822163,1.562314,1.937468,2022-10-10


In [26]:
cv_splits = RollingTimeSeriesCV(n_splits = 5, test_percent = 0.1)

rw_splits = cv_splits.split(X_df, y, 'date_time')
rw_splits

[(array([0, 1, 2, 3, 4, 5, 6, 7, 8]), array([ 9, 10])),
 (array([ 2,  3,  4,  5,  6,  7,  8,  9, 10]), array([11, 12])),
 (array([ 4,  5,  6,  7,  8,  9, 10, 11, 12]), array([13, 14])),
 (array([ 6,  7,  8,  9, 10, 11, 12, 13, 14]), array([15, 16])),
 (array([ 8,  9, 10, 11, 12, 13, 14, 15, 16]), array([17, 18]))]

In [23]:
from RollingTimeSeries import RollingTimeSeriesCV

In [20]:
X_train.index

MultiIndex([('2016-01-01 00:00:00',  107),
            ('2016-01-01 00:00:00',  111),
            ('2016-01-01 00:00:00',  117),
            ('2016-01-01 00:00:00',  118),
            ('2016-01-01 00:00:00',  137),
            ('2016-01-01 00:00:00',  139),
            ('2016-01-01 00:00:00',  141),
            ('2016-01-01 00:00:00',  144),
            ('2016-01-01 00:00:00',  147),
            ('2016-01-01 00:00:00',  149),
            ...
            ('2016-12-31 23:00:00', 1303),
            ('2016-12-31 23:00:00', 1304),
            ('2016-12-31 23:00:00', 1306),
            ('2016-12-31 23:00:00', 1310),
            ('2016-12-31 23:00:00', 1311),
            ('2016-12-31 23:00:00', 1315),
            ('2016-12-31 23:00:00', 1316),
            ('2016-12-31 23:00:00', 1318),
            ('2016-12-31 23:00:00', 1323),
            ('2016-12-31 23:00:00', 1353)],
           names=['timestamp', 'building_id'], length=1316818)