# Installs

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


# Imports

In [2]:
import pickle

In [3]:
import numpy as np
import pandas as pd

In [4]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, matthews_corrcoef, precision_score, recall_score, f1_score


# Load data

In [5]:
# Load the labelled series. It is indexed by colour name further down, and each
# entry is unpacked as a 3-tuple by get_supervised_dataset.
# NOTE: pickle.load can execute arbitrary code — only open files from a trusted source.
with open('vm_c_rate_diff_if.pickle', 'rb') as handle:
   marked_vm_series = pickle.load(handle)

In [6]:
def get_supervised_dataset(marked_series, memory_steps=10, future_step=1):
    """Turn a labelled series into sliding-window samples for supervised learning.

    Parameters
    ----------
    marked_series : sequence of three items
        ``(_, data, labels)``. The first item is ignored; ``data`` and
        ``labels`` are indexed in parallel.
    memory_steps : int
        Number of consecutive values in each window (one sample).
    future_step : int
        Offset, counted from the last element of the window, of the label used
        as the target (1 = the label right after the window).

    Returns
    -------
    X : np.ndarray
        One row per window, holding the raw (unscaled) values of ``data``.
    y : np.ndarray of int
        Target label for each window. Both arrays are empty when the series is
        too short to produce a single window.
    """
    _, data, labels = marked_series
    # The previous version fitted a MinMaxScaler here but never used its output,
    # so the windows have always been built from the raw values. The dead
    # computation is removed; the returned arrays are unchanged.
    n_windows = len(data) - memory_steps + 1 - future_step
    X = [data[start:start + memory_steps] for start in range(n_windows)]
    y = [labels[start + memory_steps - 1 + future_step] for start in range(n_windows)]
    return np.array(X), np.array(y).astype('int')

# Metric

In [7]:
def compute_results(y_true, y_pred):
    """Compute binary-classification scores and the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    metrics : dict
        Scores keyed by 'Accuracy', 'Precision', 'Recall', 'F1-score' and
        'Balanced Accuracy'.
    error_matrix : dict
        Confusion matrix as a nested dict (``DataFrame.to_dict()``): outer keys
        are the prediction columns 'a(x) = 1' / 'a(x) = 0', inner keys are the
        true-label rows 'y = 1' / 'y = 0'.
    """
    metrics = dict()
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    # zero_division=0 yields the same 0.0 score as the default when a class is
    # never predicted (or never present), but without the UndefinedMetricWarning
    # that otherwise floods the cell output.
    metrics['Precision'] = precision_score(y_true, y_pred, zero_division=0)
    metrics['Recall'] = recall_score(y_true, y_pred, zero_division=0)
    metrics['F1-score'] = f1_score(y_true, y_pred, zero_division=0)
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    # metrics["Mathew's Correlation Coefficient"] = \
    #                                        matthews_corrcoef(y_true, y_pred)
    # labels=[1, 0] puts the positive class in the first row and first column.
    error_matrix = pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=[1, 0]),
        columns=['a(x) = 1', 'a(x) = 0'],
        index=['y = 1', 'y = 0'],
    ).to_dict()
    return metrics, error_matrix

# Final predict

## Boosting

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import StackingClassifier

In [None]:
def fit_predict_xgbc(colour, memory_steps=50, future_step=1):
    """Tune an XGBoost classifier with a randomized search and score it on a hold-out split.

    The windows for ``marked_vm_series[colour]`` are split 80/20 (stratified,
    seed 42). Five random hyper-parameter combinations are evaluated with
    5-fold stratified CV on the training part, scored by F1.

    Returns
    -------
    tuple
        ``(best_estimator, test_predictions, metrics_dict, confusion_matrix_dict)``.
    """
    features, targets = get_supervised_dataset(
        marked_vm_series[colour],
        memory_steps=memory_steps,
        future_step=future_step,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, stratify=targets, random_state=42
    )

    # Fixed settings; scale_pos_weight up-weights the positive class.
    base_model = XGBClassifier(
        objective='binary:logistic',
        random_state=42,
        scale_pos_weight=49,
        learning_rate=0.2,
        n_estimators=500,
        verbosity=0,
        nthread=1,
    )

    # Values the randomized search samples from.
    search_space = {
        'min_child_weight': [1, 2, 5, 10],
        'gamma': [0.2, 0.5, 1],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 8],
    }

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    search = RandomizedSearchCV(
        base_model,
        param_distributions=search_space,
        n_iter=5,
        scoring='f1',
        n_jobs=4,
        cv=splitter.split(X_train, y_train),
        verbose=0,
        random_state=42,
    )
    search.fit(X_train, y_train)

    best_model = search.best_estimator_
    test_pred = best_model.predict(X_test)
    classifier_metrics, error_matrix = compute_results(y_test, test_pred)
    return best_model, test_pred, classifier_metrics, error_matrix

In [None]:
def fit_predict_lgbmc(colour, memory_steps=50, future_step=1):
    """Tune a LightGBM classifier with a randomized search and score it on a hold-out split.

    The windows for ``marked_vm_series[colour]`` are split 80/20 (stratified,
    seed 42). Five random hyper-parameter combinations are evaluated with
    5-fold stratified CV on the training part, scored by F1.

    Returns
    -------
    tuple
        ``(best_estimator, test_predictions, metrics_dict, confusion_matrix_dict)``.
    """
    features, targets = get_supervised_dataset(
        marked_vm_series[colour],
        memory_steps=memory_steps,
        future_step=future_step,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, stratify=targets, random_state=42
    )

    # learning_rate and max_depth set here are only starting values: both also
    # appear in the search space below, so the search overrides them.
    base_model = LGBMClassifier(
        verbose=-1,
        objective='binary',
        scale_pos_weight=50,
        min_gain_to_split=1,
        learning_rate=0.2,
        n_estimators=500,
        max_depth=5,
        random_state=42,
    )

    # NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0,
    # which is not set here — confirm whether row bagging was intended.
    search_space = {
        'learning_rate': [0.1, 0.2, 0.3],
        'num_leaves': [6, 8, 12, 16],
        'boosting_type': ['gbdt', 'dart'],
        'colsample_bytree': [0.64, 0.65, 0.66],
        'subsample': [0.7, 0.725, 0.75],
        'max_depth': [-1, 3, 5, 8],
        'reg_alpha': [1, 1.2],
        'reg_lambda': [1, 1.2, 1.4],
    }

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    search = RandomizedSearchCV(
        base_model,
        param_distributions=search_space,
        n_iter=5,
        scoring='f1',
        n_jobs=4,
        cv=splitter.split(X_train, y_train),
        verbose=0,
        random_state=42,
    )
    search.fit(X_train, y_train)

    best_model = search.best_estimator_
    test_pred = best_model.predict(X_test)
    classifier_metrics, error_matrix = compute_results(y_test, test_pred)
    return best_model, test_pred, classifier_metrics, error_matrix

In [None]:
def fit_predict_cbc(colour, memory_steps=50, future_step=1):
    """Fit a CatBoost classifier with fixed hyper-parameters on GPU and score it on a hold-out split.

    No hyper-parameter search is performed in this variant.

    NOTE: a later cell defines another function with this same name (the
    randomized-search variant); whichever cell was executed last is the one
    that `all_predictions` calls.

    Returns
    -------
    tuple
        ``(fitted_model, test_predictions, metrics_dict, confusion_matrix_dict)``.
    """
    features, targets = get_supervised_dataset(
        marked_vm_series[colour],
        memory_steps=memory_steps,
        future_step=future_step,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, stratify=targets, random_state=42
    )

    model = CatBoostClassifier(
        auto_class_weights='Balanced',
        learning_rate=0.2,
        depth=6,
        l2_leaf_reg=7,
        task_type='GPU',
        verbose=False,
        random_seed=42,
    )
    model.fit(X_train, y_train)

    test_pred = model.predict(X_test)
    classifier_metrics, error_matrix = compute_results(y_test, test_pred)
    return model, test_pred, classifier_metrics, error_matrix

In [None]:
def fit_predict_cbc(colour, memory_steps=50, future_step=1):
    """Tune a CatBoost classifier with a randomized search and score it on a hold-out split.

    The windows for ``marked_vm_series[colour]`` are split 80/20 (stratified,
    seed 42). Five random hyper-parameter combinations are evaluated with
    5-fold stratified CV on the training part, scored by F1.

    NOTE: an earlier cell defines a fixed-hyper-parameter function with this
    same name; when the cells run top to bottom, this definition replaces it.

    Returns
    -------
    tuple
        ``(best_estimator, test_predictions, metrics_dict, confusion_matrix_dict)``.
    """
    X, y = get_supervised_dataset(marked_vm_series[colour], memory_steps=memory_steps, future_step=future_step)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    # Single-valued entries ('auto_class_weights', 'random_seed') are constants
    # passed through the search so every candidate shares them.
    params = {'learning_rate': [0.1, 0.2],
              'depth': [4, 6, 8],
              'auto_class_weights': ['Balanced'],
              'l2_leaf_reg': [0, 5, 7],
              'random_seed': [42]}
    cbc = CatBoostClassifier(
        thread_count=4,
        verbose=False,
        num_trees=500,
        eval_metric='F1',
    )
    folds = 5
    param_comb = 5
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    random_search = RandomizedSearchCV(cbc, param_distributions=params, n_iter=param_comb, scoring='f1',
                                       n_jobs=4, cv=skf.split(X_train, y_train), verbose=0, random_state=42)
    random_search.fit(X_train, y_train)
    # With the default refit=True the search has already retrained the best
    # configuration on the whole training split, so best_estimator_ is ready to
    # use. The extra fit that used to follow only repeated that training
    # (same data, same seed) and doubled the cost.
    bcbc = random_search.best_estimator_
    y_pred = bcbc.predict(X_test)
    classifier_metrics, error_matrix = compute_results(y_test, y_pred)
    return bcbc, y_pred, classifier_metrics, error_matrix

In [None]:
def fit_predict_stack_logreg(estimators, colour, memory_steps=50, future_step=1):
    """Stack the given estimators under a logistic-regression meta-learner and score the result.

    Parameters
    ----------
    estimators : list
        Base estimators; each is registered under its position ('0', '1', ...).
    colour : key into ``marked_vm_series``
    memory_steps, future_step : int
        Forwarded to ``get_supervised_dataset``.

    Returns
    -------
    tuple
        ``(fitted_stack, test_predictions, metrics_dict, confusion_matrix_dict)``.
    """
    features, targets = get_supervised_dataset(
        marked_vm_series[colour],
        memory_steps=memory_steps,
        future_step=future_step,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, stratify=targets, random_state=42
    )

    # StackingClassifier needs (name, estimator) pairs; the index is used as the name.
    position_names = [str(position) for position in range(len(estimators))]
    named_estimators = list(zip(position_names, estimators))

    meta_learner = LogisticRegressionCV(
        random_state=42, class_weight='balanced', cv=5, scoring='f1'
    )
    stack = StackingClassifier(estimators=named_estimators, final_estimator=meta_learner)
    stack.fit(X_train, y_train)

    test_pred = stack.predict(X_test)
    classifier_metrics, error_matrix = compute_results(y_test, test_pred)
    return stack, test_pred, classifier_metrics, error_matrix

In [None]:
def all_predictions(colour, memory_steps=50, future_steps=1):
    """Train and evaluate every classifier for one colour and collect the results.

    Runs XGBoost, LightGBM and CatBoost in turn, then a stacking classifier
    built from the three tuned models, printing a progress line after each.

    Parameters
    ----------
    colour : key into ``marked_vm_series``
    memory_steps : int
        Window length forwarded to each ``fit_predict_*`` function.
    future_steps : int
        Forecast offset forwarded as ``future_step``.

    Returns
    -------
    classifiers_metrics : dict
        Metric dicts keyed by 'xgbc', 'lgbmc', 'cbc', 'stackc'.
    classifiers_error_matrix : dict
        Confusion-matrix dicts under the same four keys.
    """
    classifiers_metrics = dict()
    classifiers_error_matrix = dict()
    print(f'Colour: {colour}')
    xgbc, _, classifier_metrics, error_matrix = fit_predict_xgbc(colour, memory_steps, future_steps)
    classifiers_metrics['xgbc'] = classifier_metrics
    classifiers_error_matrix['xgbc'] = error_matrix
    print('Done xgbc')
    lgbmc, _, classifier_metrics, error_matrix = fit_predict_lgbmc(colour, memory_steps, future_steps)
    classifiers_metrics['lgbmc'] = classifier_metrics
    classifiers_error_matrix['lgbmc'] = error_matrix
    print('Done lgbm')
    cbc, _, classifier_metrics, error_matrix = fit_predict_cbc(colour, memory_steps, future_steps)
    classifiers_metrics['cbc'] = classifier_metrics
    classifiers_error_matrix['cbc'] = error_matrix
    print('Done cbc')
    stackc, _, classifier_metrics, error_matrix = fit_predict_stack_logreg([xgbc, lgbmc, cbc], colour, memory_steps, future_steps)
    classifiers_metrics['stackc'] = classifier_metrics
    # Fixed: this used to be stored under 'cbc', which overwrote CatBoost's
    # confusion matrix and left the stacking model without one.
    classifiers_error_matrix['stackc'] = error_matrix
    print('Done stackc')
    return classifiers_metrics, classifiers_error_matrix

In [None]:
# c, e = all_predictions('brown', memory_steps=50, future_steps=1)

In [None]:
# pd.DataFrame(c)

In [None]:
# Result containers filled by the experiment loop below
# (indexed there as [steps][colour]).
colour_classifier_metrics = dict()
colour_confusion_matrix = dict()

In [None]:
# for colour in ['brown', 'pink', 'white', 'blue', 'violet']:
  #   colour_classifier_metrics[colour], colour_confusion_matrix[colour] = all_predictions(colour, memory_steps=50, future_steps=1)

In [None]:
#pd.DataFrame.from_dict({(i,j): colour_classifier_metrics[i][j]
#                           for i in colour_classifier_metrics.keys()
#                           for j in colour_classifier_metrics[i].keys()},
#                       orient='index').transpose()

In [None]:
# Forecast offsets to evaluate; each is passed to all_predictions as `future_steps`.
future_steps = [1, 2, 3, 4, 5]

In [None]:
# Full experiment grid: every forecast offset x every colour.
# Results are stored as colour_classifier_metrics[steps][colour] and
# colour_confusion_matrix[steps][colour]. Each all_predictions call runs
# three randomized searches plus a stacking fit, so this cell is slow.
for steps in future_steps:
    print(f'Steps = {steps}')
    colour_classifier_metrics[steps] = dict()
    colour_confusion_matrix[steps] = dict()
    for colour in ['brown', 'pink', 'white', 'blue', 'violet']:
        colour_classifier_metrics[steps][colour], colour_confusion_matrix[steps][colour] = \
            all_predictions(colour, memory_steps=50, future_steps=steps)

Steps = 1
Colour: brown
Done xgbc
Done lgbm
Done cbc
Done stackc
Colour: pink
Done xgbc
Done lgbm
Done cbc
Done stackc
Colour: white
Done xgbc
Done lgbm
Done cbc
Done stackc
Colour: blue
Done xgbc
Done lgbm
Done cbc
Done stackc
Colour: violet
Done xgbc
Done lgbm
Done cbc
Done stackc
Steps = 2
Colour: brown
Done xgbc
Done lgbm
Done cbc
Done stackc
Colour: pink
Done xgbc
Done lgbm
Done cbc
Done stackc
Colour: white
Done xgbc
Done lgbm
Done cbc
Done stackc
Colour: blue
Done xgbc
Done lgbm
Done cbc
Done stackc
Colour: violet
Done xgbc
Done lgbm
Done cbc
Done stackc
Steps = 3
Colour: brown
Done xgbc
Done lgbm
Done cbc
Done stackc
Colour: pink
Done xgbc
Done lgbm
Done cbc
Done stackc
Colour: white
Done xgbc
Done lgbm
Done cbc
Done stackc
Colour: blue
Done xgbc
Done lgbm
Done cbc
Done stackc
Colour: violet
Done xgbc
Done lgbm
Done cbc
Done stackc
Steps = 4
Colour: brown
Done xgbc
Done lgbm
Done cbc
Done stackc
Colour: pink
Done xgbc
Done lgbm
Done cbc
Done stackc
Colour: white
Done xgbc
Done 

  _warn_prf(average, modifier, msg_start, len(result))


Done xgbc
Done lgbm
Done cbc
Done stackc


In [None]:
# Persist the metrics so the tables can be rebuilt without re-running the experiment loop.
with open('colour_classifier_metrics_steps_cv.pickle', 'wb') as handle:
   pickle.dump(colour_classifier_metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the confusion matrices alongside the metrics.
# NOTE(review): the file name contains a double dot ('..pickle'), which looks like a
# typo — check any code that reads this file back before renaming it.
with open('colour_confusion_matrix_steps_cv..pickle', 'wb') as handle:
   pickle.dump(colour_confusion_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def display_metrics(steps):
    """Build a metrics table for one forecast offset.

    Reads ``colour_classifier_metrics[steps]`` (colour -> classifier -> metric
    dict) and returns a DataFrame with one row per metric and one column per
    (colour, classifier) pair.
    """
    per_colour = colour_classifier_metrics[steps]
    flattened = {}
    for colour_name, per_classifier in per_colour.items():
        for classifier_name, scores in per_classifier.items():
            flattened[(colour_name, classifier_name)] = scores
    # Rows start out as (colour, classifier) pairs; transpose so metrics become the rows.
    return pd.DataFrame.from_dict(flattened, orient='index').T

In [None]:
display_metrics(1)

Unnamed: 0_level_0,brown,brown,brown,brown,pink,pink,pink,pink,white,white,white,white,blue,blue,blue,blue,violet,violet,violet,violet
Unnamed: 0_level_1,xgbc,lgbmc,cbc,stackc,xgbc,lgbmc,cbc,stackc,xgbc,lgbmc,cbc,stackc,xgbc,lgbmc,cbc,stackc,xgbc,lgbmc,cbc,stackc
Accuracy,0.983928,0.975892,0.97991,0.945756,0.974887,0.96886,0.979407,0.8001,0.976394,0.9111,0.974385,0.601708,0.970367,0.903064,0.975892,0.763435,0.977398,0.907584,0.978905,0.834756
Precision,0.681818,0.409091,0.5,0.216667,0.210526,0.129032,0.333333,0.047859,0.0,0.079755,0.133333,0.032298,0.047619,0.030675,0.0,0.022173,0.0,0.026316,0.0,0.03537
Recall,0.375,0.45,0.4,0.65,0.102564,0.102564,0.051282,0.487179,0.0,0.325,0.05,0.65,0.025,0.125,0.0,0.25,0.0,0.1,0.0,0.275
F1-score,0.483871,0.428571,0.444444,0.325,0.137931,0.114286,0.088889,0.087156,0.0,0.128079,0.072727,0.061538,0.032787,0.049261,0.0,0.040733,0.0,0.041667,0.0,0.062678
Balanced Accuracy,0.685706,0.718337,0.6959,0.80091,0.54744,0.544366,0.524616,0.646766,0.498206,0.624058,0.521668,0.625359,0.507374,0.522008,0.49795,0.511981,0.498719,0.512071,0.499487,0.560616


In [None]:
display_metrics(3)

Unnamed: 0_level_0,brown,brown,brown,brown,pink,pink,pink,pink,white,white,white,white,blue,blue,blue,blue,violet,violet,violet,violet
Unnamed: 0_level_1,xgbc,lgbmc,cbc,stackc,xgbc,lgbmc,cbc,stackc,xgbc,lgbmc,cbc,stackc,xgbc,lgbmc,cbc,stackc,xgbc,lgbmc,cbc,stackc
Accuracy,0.978392,0.970854,0.979397,0.951759,0.972864,0.970352,0.977889,0.805025,0.976382,0.959799,0.974372,0.808543,0.974874,0.907538,0.970352,0.714573,0.977889,0.896985,0.971357,0.51206
Precision,0.451613,0.3125,0.484848,0.230769,0.058824,0.083333,0.272727,0.051414,0.0,0.045455,0.0,0.045333,0.083333,0.026316,0.0,0.018248,0.0,0.011834,0.0,0.022564
Recall,0.35,0.375,0.4,0.6,0.025641,0.051282,0.076923,0.512821,0.0,0.05,0.0,0.425,0.025,0.1,0.0,0.25,0.0,0.05,0.0,0.55
F1-score,0.394366,0.340909,0.438356,0.333333,0.035714,0.063492,0.12,0.093458,0.0,0.047619,0.0,0.081928,0.038462,0.041667,0.0,0.034014,0.0,0.019139,0.0,0.04335
Balanced Accuracy,0.670641,0.679038,0.695641,0.779487,0.50872,0.520003,0.536411,0.661843,0.498205,0.514231,0.497179,0.620705,0.509679,0.512051,0.495128,0.487051,0.498974,0.482179,0.495641,0.530641


In [None]:
display_metrics(5)

Unnamed: 0_level_0,brown,brown,brown,brown,pink,pink,pink,pink,white,white,white,white,blue,blue,blue,blue,violet,violet,violet,violet
Unnamed: 0_level_1,xgbc,lgbmc,cbc,stackc,xgbc,lgbmc,cbc,stackc,xgbc,lgbmc,cbc,stackc,xgbc,lgbmc,cbc,stackc,xgbc,lgbmc,cbc,stackc
Accuracy,0.980905,0.970352,0.977387,0.951759,0.976382,0.972864,0.979397,0.876884,0.968342,0.962814,0.977889,0.720603,0.973869,0.9,0.971859,0.630151,0.979899,0.89196,0.976884,0.468342
Precision,0.535714,0.306122,0.424242,0.214286,0.214286,0.173913,0.375,0.070833,0.0,0.027778,0.0,0.034296,0.071429,0.023952,0.0,0.02459,0.0,0.016575,0.0,0.024299
Recall,0.375,0.375,0.35,0.525,0.076923,0.102564,0.076923,0.435897,0.0,0.025,0.0,0.475,0.025,0.1,0.0,0.45,0.0,0.075,0.0,0.65
F1-score,0.441176,0.337079,0.383562,0.304348,0.113208,0.129032,0.12766,0.121864,0.0,0.026316,0.0,0.063973,0.037037,0.038647,0.0,0.046632,0.0,0.027149,0.0,0.046847
Balanced Accuracy,0.684167,0.678782,0.670128,0.742756,0.535642,0.546413,0.53718,0.660799,0.494103,0.503526,0.498974,0.600321,0.509167,0.508205,0.495897,0.541923,0.5,0.491859,0.498462,0.557308


In [None]:
# Export one CSV per forecast offset (f1.csv ... f5.csv); range(1, 6) mirrors
# the values in `future_steps`.
for i in range(1, 6):
    (display_metrics(i)).to_csv(f'f{i}.csv')

## LSTM

In [8]:
# Build 50-step windows for the 'brown' series (next-step target) and cast the
# labels to float32 before they are fed to the Keras model below.
X, y = get_supervised_dataset(marked_vm_series['brown'], memory_steps=50, future_step=1)
y = y.astype('float32')

In [9]:
# Stratified 80/20 train/test split, then a second stratified 80/20 split of
# the training part into train/validation (same seed for both).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify = y_train, random_state=42)

In [10]:
import numpy as np
import tensorflow.keras as keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
from sklearn.utils import class_weight

In [11]:
# Three stacked LSTM layers (50 -> 30 -> 20 units), each followed by 20% dropout,
# with a single sigmoid unit producing the positive-class probability.
# input_shape declares one feature per time step.
# NOTE(review): X_train appears to be 2-D (samples, memory_steps) as built above —
# confirm Keras adds the trailing feature axis as expected, or reshape explicitly.
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(50, input_shape=(X_train.shape[1], 1), return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(30, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(20),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [12]:
# 'balanced' weights are inversely proportional to class frequency; the result
# is ordered like np.unique(y_train).
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

In [13]:
class_weights

array([ 0.51025641, 24.875     ])

In [14]:
# Binary cross-entropy with Adam; recall is tracked next to accuracy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', keras.metrics.Recall()])

In [15]:
# Train for a fixed 100 epochs (no early stopping) with per-class loss weights.
# class_weights[0] / class_weights[1] follow the order of np.unique(y_train);
# this presumes the labels are 0 and 1 — TODO confirm.
model.fit(X_train, y_train, validation_data=(X_val, y_val),
          epochs=100, batch_size=32,
          class_weight={0: class_weights[0], 1: class_weights[1]}
          #class_weight={0:1, 1:48}
          )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7b895c1262f0>

In [16]:
# Predict on the test set and threshold the sigmoid output at 0.5 to get hard labels.
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)



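Model overfits!! On the test set it never predicts the positive class (precision, recall and F1 are all 0 in the results below).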

In [17]:
# c: metrics dict, e: confusion-matrix dict for the LSTM's test-set predictions.
c, e = compute_results(y_test, y_pred_binary)

  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
c

{'Accuracy': 0.9799095931692616,
 'Precision': 0.0,
 'Recall': 0.0,
 'F1-score': 0.0,
 'Balanced Accuracy': 0.5}

In [19]:
pd.DataFrame(e)

Unnamed: 0,a(x) = 1,a(x) = 0
y = 1,0,40
y = 0,0,1951
