Import all the necessary packages

In [None]:
# The star import comes first so that any name it exports which clashes with
# an explicit import below (e.g. `Pipeline`, `DataLoader`) is overridden by
# the explicit import rather than the other way round.
from fastai.tabular.all import *

import autokeras as ak
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from bayes_opt import BayesianOptimization
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    mean_squared_error,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, BertForSequenceClassification, BertModel, BertTokenizer
from xgboost import XGBClassifier

Load dataset files

In [None]:
# Read the four dataset CSV files from the working directory, one frame each.
event_traces, log_templates, anomaly_labels, event_occurence = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Event_traces.csv',
        'HDFS.log_templates.csv',
        'anomaly_label.csv',
        'Event_occurence_matrix.csv',
    )
)

Step 1 : Data Integration, Feature engineering, Text Data Preprocessing and Feature Combination

Merge relevant datasets using BlockID as key

In [None]:
# Join the block-level tables on BlockID, then attach the template table on
# EventID. The frames are referenced by the names assigned in the loading
# cell (`event_occurence`, `log_templates`).
# NOTE(review): pandas suffixes overlapping non-key columns with _x/_y on
# merge — confirm that 'Label' and 'EventID' exist under those exact names.
merged_data = event_traces.merge(anomaly_labels, on='BlockID')
merged_data = merged_data.merge(event_occurence, on='BlockID')
merged_data = merged_data.merge(log_templates, on='EventID')

Feature Engineering

1. Block Attributes

1.1 Block attributes including one-hot encoding for categorical 'Label'

In [None]:
# Numeric block attributes plus a one-hot encoding of the 'Label' column.
# NOTE(review): 'Label' is also the prediction target in the data-splitting
# cell, so these columns must not reach the model as input features.
block_attributes = merged_data[['Time Interval', 'Latency']]
label_encoder = OneHotEncoder()
# Densify explicitly instead of relying on the constructor's `sparse=` flag,
# which was renamed across scikit-learn releases.
encoded_labels = label_encoder.fit_transform(merged_data[['Label']]).toarray()
label_columns = label_encoder.get_feature_names_out(['Label'])
block_attributes = pd.concat(
    [
        block_attributes,
        # Share the index so the concat is a plain column-wise join.
        pd.DataFrame(encoded_labels, columns=label_columns, index=block_attributes.index),
    ],
    axis=1,
)

1.2 One-hot encoding for categorical 'Type' attribute

In [None]:
# Append a one-hot encoding of the categorical 'Type' column.
type_encoder = OneHotEncoder()
# Densify explicitly instead of relying on the constructor's `sparse=` flag,
# which was renamed across scikit-learn releases.
encoded_types = type_encoder.fit_transform(merged_data[['Type']]).toarray()
type_columns = type_encoder.get_feature_names_out(['Type'])
block_attributes = pd.concat(
    [
        block_attributes,
        # Share the index so the concat is a plain column-wise join.
        pd.DataFrame(encoded_types, columns=type_columns, index=block_attributes.index),
    ],
    axis=1,
)

2. Log Templates

In [None]:
# Raw log-template text, one entry per merged row.
text_data = merged_data.loc[:, 'EventTemplate']

3. Text Data Processing

In [None]:
# Fetch the NLTK resources used by the tokenizer, lemmatizer and stop-word list.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

4. Tokenization and lowercasing

In [None]:
# Lower-case each template and split it into word tokens.
text_data = text_data.apply(lambda template: nltk.word_tokenize(template.lower()))

5. Stop words removal

In [None]:
stop_words = set(stopwords.words('english'))


def _without_stop_words(tokens):
    """Return the tokens that are not English stop words, order preserved."""
    return [token for token in tokens if token not in stop_words]


text_data = text_data.apply(_without_stop_words)

6. Lemmatization

In [None]:
# Reduce every token to its WordNet lemma.
lemmatizer = WordNetLemmatizer()
text_data = text_data.apply(lambda tokens: list(map(lemmatizer.lemmatize, tokens)))

Convert back to text

In [None]:
# Glue each token list back into a single space-separated string.
text_data = text_data.apply(' '.join)

Combine Features

Vectorize text data

In [None]:
# Bag-of-words counts, capped at the 1000 most frequent terms.
vectorizer = CountVectorizer(max_features=1000)
term_counts = vectorizer.fit_transform(text_data)
text_features = term_counts.toarray()

Standardize numerical attributes

In [None]:
# Standardize the block attributes that are legitimate model inputs.
# The one-hot 'Label_*' columns encode the prediction target itself, so they
# are excluded here; feeding them to the models would leak the answer.
target_columns = [col for col in block_attributes.columns if str(col).startswith('Label_')]
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on all rows before the train/val/test
# split — consider fitting on the training partition only.
num_attributes = scaler.fit_transform(block_attributes.drop(columns=target_columns))

Combine text and numerical features

In [None]:
# Text-count columns first, scaled block attributes after them.
text_frame = pd.DataFrame(text_features)
numeric_frame = pd.DataFrame(num_attributes)
combined_features = pd.concat([text_frame, numeric_frame], axis=1)

Data Splitting

Split the data into training, validation and testing sets (80%, 10%, 10%)

In [None]:
# Hold out 20% of the rows, stratified on the label, for validation + test.
target = merged_data['Label']
X_train, X_temp, y_train, y_temp = train_test_split(
    combined_features,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

In [None]:
# Halve the held-out rows into validation and test partitions.
holdout_split_options = dict(test_size=0.5, random_state=42, stratify=y_temp)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, **holdout_split_options)

Print the shapes of the split datasets

In [None]:
# Report the row/column counts of each partition.
for caption, partition in (
    ("Training data shape: ", X_train),
    ("Validation data shape: ", X_val),
    ("Testing data shape: ", X_test),
):
    print(caption, partition.shape)

Step 2 : Model Selection and Development

I. Prepare the tabular data

Separate features and labels

In [None]:
# The structured columns sit after the text-count columns in
# `combined_features`. `max_features=1000` is only an upper bound on the
# vocabulary size, so the offset is taken from the actual matrix width.
n_text_columns = text_features.shape[1]
X_train_tabular = X_train.iloc[:, n_text_columns:]
X_val_tabular = X_val.iloc[:, n_text_columns:]
y_train_tabular = y_train
y_val_tabular = y_val

Hyperparameter tuning for XGBoost

Create a ColumnTransformer for numerical and categorical features

In [None]:
# First two tabular columns are treated as numeric, the rest as categorical.
tabular_columns = X_train_tabular.columns
numeric_features, categorical_features = tabular_columns[:2], tabular_columns[2:]

numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

Define XGBoost model

In [None]:
# Binary-logistic XGBoost classifier with a fixed seed and log-loss metric.
xgb_settings = {
    'objective': 'binary:logistic',
    'random_state': 42,
    'eval_metric': 'logloss',
}
xgb_model = XGBClassifier(**xgb_settings)

Create a pipeline

In [None]:
# Preprocessing followed by the XGBoost classifier.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', xgb_model),
]
pipeline = Pipeline(steps=pipeline_steps)

Define Hyperparameters to tune

In [None]:
# Grid over the 'model' pipeline step (the `model__` prefix routes each
# parameter to that step).
param_grid = dict(
    model__n_estimators=[50, 100, 150],
    model__max_depth=[4, 6, 8],
    model__learning_rate=[0.01, 0.1, 0.2],
)

Using GridSearchCV for hyperparameter tuning

In [None]:
# 5-fold cross-validated grid search over the pipeline, scored by accuracy.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_tabular, y_train_tabular)

best_params = grid_search.best_params_
print("Best Parameters : ", best_params)

# Pipeline refitted on the full training partition with the best parameters.
best_model = grid_search.best_estimator_

Evaluate the Tuned XGBoost Model

In [None]:
# Score the validation rows with the tuned pipeline.
y_pred_val_tabular_tuned = best_model.predict(X=X_val_tabular)

Calculating accuracy

In [None]:
# Fraction of validation rows the tuned pipeline classified correctly.
tuned_accuracy = accuracy_score(y_val_tabular, y_pred_val_tabular_tuned)
print("Validation Accuracy (Tuned XGBoost) : ", tuned_accuracy)

II. Text Classification Models

In [None]:
# Tokenize Text Data using BERT Tokenizer
# Bound to `tokenizer`, the name the encoding cell calls.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Tokenize text and convert to input IDs and attention masks.
# Every sequence is padded/truncated to 128 tokens and returned as PyTorch
# tensors, one row per entry of `text_data`.
encoded_text = tokenizer.batch_encode_plus(
    text_data.tolist(),  # hand the tokenizer a plain list of strings
    add_special_tokens=True,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
    max_length=128
)

input_ids = encoded_text['input_ids']
attention_mask = encoded_text['attention_mask']

In [None]:
# Convert labels to tensors.
# Go through NumPy: the split labels carry a shuffled index, and handing the
# Series itself to torch.tensor makes it look values up by index label.
# NOTE(review): this requires numeric labels — confirm 'Label' is 0/1 encoded.
y_train_text = torch.tensor(y_train.to_numpy())
y_val_text = torch.tensor(y_val.to_numpy())

In [None]:
# Create TensorDatasets for text data.
# `input_ids` / `attention_mask` cover every row of `text_data`, while the
# labels cover only one partition, so the matching rows are selected first.
# NOTE(review): assumes the index labels of y_train / y_val equal row
# positions in `text_data` (true for a default RangeIndex) — confirm.
train_rows = torch.as_tensor(y_train.index.to_numpy())
val_rows = torch.as_tensor(y_val.index.to_numpy())

train_dataset = TensorDataset(input_ids[train_rows], attention_mask[train_rows], y_train_text)
val_dataset = TensorDataset(input_ids[val_rows], attention_mask[val_rows], y_val_text)

In [None]:
# Batch the datasets; only the training loader reshuffles between epochs.
train_batch_size = val_batch_size = 16  # Adjust as needed

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=val_batch_size)

In [None]:
# Load the pre-trained BERT model with a two-class classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Optimizer and loss function.
# `criterion` is not called in this cell: the model computes the loss itself
# when `labels` is passed to it.
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 3  # Adjust as needed

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Batch-local names: the notebook-level `input_ids` / `attention_mask`
        # hold the full corpus and must not be rebound to a single batch.
        batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Avg. Loss: {avg_loss:.4f}")

In [None]:
# Evaluation on Validation Set
model.eval()
y_pred_text = []
y_true_text = []

with torch.no_grad():
    for batch in val_loader:
        # Batch-local names: the notebook-level `input_ids` / `attention_mask`
        # hold the full corpus and must not be rebound to a single batch.
        batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)

        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        # Predicted class = index of the largest logit.
        preds = torch.argmax(outputs.logits, dim=1)

        y_pred_text.extend(preds.cpu().numpy())
        y_true_text.extend(batch_labels.cpu().numpy())

val_accuracy = accuracy_score(y_true_text, y_pred_text)
print("Validation Accuracy (BERT):", val_accuracy)

III. Time Series Analysis

In [None]:
# Independent copy of the two timing columns, so later in-place edits do not
# touch `merged_data`.
timing_columns = ['Time Interval', 'Latency']
time_series_data = merged_data.loc[:, timing_columns].copy()

In [None]:
# Min-max scale each timing column: (value - min) / (max - min).
for column in ('Time Interval', 'Latency'):
    values = time_series_data[column]
    low, high = values.min(), values.max()
    time_series_data[column] = (values - low) / (high - low)

In [None]:
# Create input sequences and target values for LSTM.
# Each sample is `sequence_length` consecutive latency values; its target is
# the latency that immediately follows the window.
sequence_length = 10  # Adjust as needed

latency = time_series_data['Latency'].to_numpy()
# The last full window has no following value, hence len - sequence_length.
n_windows = max(len(latency) - sequence_length, 0)

if n_windows:
    windows = np.lib.stride_tricks.sliding_window_view(latency, sequence_length)[:n_windows]
    # Trailing feature axis: the LSTM is declared with
    # input_shape=(sequence_length, 1), i.e. one feature per time step.
    input_sequences = windows[..., np.newaxis].copy()
    target_values = latency[sequence_length:].copy()
else:
    # Not enough rows for a single window: well-formed empty arrays.
    input_sequences = np.empty((0, sequence_length, 1), dtype=latency.dtype)
    target_values = np.empty((0,), dtype=latency.dtype)

In [None]:
# Chronological 80/20 split: shuffle=False keeps the validation windows after
# the training windows.
X_train_lstm, X_val_lstm, y_train_lstm, y_val_lstm = train_test_split(
    input_sequences,
    target_values,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)

In [None]:
# Build the LSTM model: one 64-unit LSTM layer feeding a single linear output,
# trained on mean squared error.
# NOTE(review): Sequential, LSTM and Dense are not among the imports visible
# at the top of this notebook — confirm where they come from.
lstm_model = Sequential()
lstm_model.add(LSTM(64, input_shape=(sequence_length, 1)))
lstm_model.add(Dense(1))
lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# The ensembling section refers to `lstm_model`; `model` is kept as an alias
# because the next cells call `model.fit` / `model.predict`.
model = lstm_model

In [None]:
# Fit for 50 epochs in batches of 16, validating on the held-out windows.
lstm_fit_options = dict(
    epochs=50,
    batch_size=16,
    validation_data=(X_val_lstm, y_val_lstm),
)
model.fit(X_train_lstm, y_train_lstm, **lstm_fit_options)


In [None]:
# Mean squared error of the LSTM forecasts on the validation windows.
y_pred_lstm = model.predict(X_val_lstm)
mse_lstm = mean_squared_error(y_true=y_val_lstm, y_pred=y_pred_lstm)
print("Mean Squared Error (LSTM):", mse_lstm)

In [None]:
# Overlay the actual and predicted validation latency.
# NOTE(review): `plt` is not among the imports visible at the top of this
# notebook — confirm matplotlib.pyplot is imported.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_val_lstm, label='Actual Latency')
ax.plot(y_pred_lstm, label='Predicted Latency')
ax.set_xlabel('Time')
ax.set_ylabel('Latency')
ax.legend()
plt.show()

IV. Multi Modal Approach

In [None]:
# Tokenizer and bare BERT encoder (no classification head) from one checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint, do_lower_case=True)
bert_model = BertModel.from_pretrained(bert_checkpoint)

In [None]:
# Encode every template to fixed-length (128 token) id and mask tensors.
encoding_options = dict(
    add_special_tokens=True,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
    max_length=128,  # Adjust as needed
)
encoded_text = tokenizer.batch_encode_plus(text_data, **encoding_options)

input_ids = encoded_text['input_ids']
attention_mask = encoded_text['attention_mask']

In [None]:
# Convert labels to tensors.
# Go through NumPy: the split labels carry a shuffled index, and handing the
# Series itself to torch.tensor makes it look values up by index label.
# NOTE(review): this requires numeric labels — confirm 'Label' is 0/1 encoded.
y_train_text = torch.tensor(y_train.to_numpy())
y_val_text = torch.tensor(y_val.to_numpy())

In [None]:
# Create TensorDatasets for text data.
# `input_ids` / `attention_mask` cover every row of `text_data`, while the
# labels cover only one partition, so the matching rows are selected first.
# NOTE(review): assumes the index labels of y_train / y_val equal row
# positions in `text_data` (true for a default RangeIndex) — confirm.
train_rows = torch.as_tensor(y_train.index.to_numpy())
val_rows = torch.as_tensor(y_val.index.to_numpy())

train_dataset = TensorDataset(input_ids[train_rows], attention_mask[train_rows], y_train_text)
val_dataset = TensorDataset(input_ids[val_rows], attention_mask[val_rows], y_val_text)

In [None]:
# Batch sizes for the two loaders; adjust as needed.
train_batch_size, val_batch_size = 16, 16

# Training batches are reshuffled each epoch; validation order is fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=train_batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=val_batch_size, shuffle=False)

In [None]:
# Adaptation: prepare structured features and labels as tensors.
# pandas objects are converted through NumPy; features become float32 so they
# can be fed to nn.Linear.
# NOTE(review): despite the `_loader_` names these are plain tensors holding
# the whole partition, not DataLoaders.
train_loader_structured = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
val_loader_structured = torch.tensor(X_val.to_numpy(), dtype=torch.float32)
train_loader_labels = torch.tensor(y_train.to_numpy())
val_loader_labels = torch.tensor(y_val.to_numpy())

In [None]:
# Build a neural network for fusion
class MultiModalFusion(nn.Module):
    """Fuse a pooled text embedding with projected structured features.

    Args:
        text_embedding_dim: Width of the text encoder's token embeddings.
        num_structured_features: Number of structured input columns.
        hidden_dim: Width of the structured-feature projection.
        output_dim: Number of output logits.
        bert: Optional text encoder module. It is called as
            ``bert(input_ids, attention_mask=mask)`` and element ``[0]`` of
            its result is used as the per-token embeddings. Defaults to the
            notebook-level ``bert_model``.
    """

    def __init__(self, text_embedding_dim, num_structured_features, hidden_dim, output_dim, bert=None):
        super(MultiModalFusion, self).__init__()
        self.bert = bert if bert is not None else bert_model
        self.xgboost_fc = nn.Linear(num_structured_features, hidden_dim)
        self.fusion_fc = nn.Linear(text_embedding_dim + hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, text_input_ids, text_attention_mask, structured_features):
        """Return logits of shape (batch, output_dim)."""
        text_outputs = self.bert(text_input_ids, attention_mask=text_attention_mask)[0]

        # Masked mean pooling: average only over real tokens. Inputs are
        # padded to a fixed length, so an unmasked mean would be dominated by
        # padding positions.
        token_mask = text_attention_mask.unsqueeze(-1).to(text_outputs.dtype)
        token_sum = (text_outputs * token_mask).sum(dim=1)
        token_count = token_mask.sum(dim=1).clamp(min=1.0)  # guard all-padding rows
        text_pooled_output = token_sum / token_count

        xgboost_features = self.xgboost_fc(structured_features)

        fusion_input = torch.cat((text_pooled_output, xgboost_features), dim=1)
        fusion_input = self.relu(fusion_input)
        fusion_input = self.dropout(fusion_input)

        output = self.fusion_fc(fusion_input)
        return output

In [None]:
# Initialize and set up the model, optimizer, and loss function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
multi_modal_model = MultiModalFusion(text_embedding_dim=768, num_structured_features=combined_features.shape[1], hidden_dim=128, output_dim=2)
multi_modal_model.to(device)

optimizer = optim.Adam(multi_modal_model.parameters(), lr=0.001)
# The scheduler is stepped with validation accuracy, where higher is better,
# so it must watch for a plateau of the maximum (the default mode is 'min').
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=2, factor=0.5, verbose=True)
criterion = nn.CrossEntropyLoss()


In [None]:
# Training loop with early stopping
epochs = 10  # Adjust as needed
best_val_accuracy = 0.0
early_stopping_patience = 5
early_stopping_counter = 0


def _fusion_loader(features, labels, batch_size, shuffle):
    """Build a DataLoader whose batches keep all modalities row-aligned.

    Each sample bundles the token ids, attention mask, structured feature row
    and label of one data row, so shuffling cannot separate them.

    NOTE(review): assumes the index labels of `features` equal row positions
    in the encoded text tensors (true for a default RangeIndex) — confirm.
    """
    rows = torch.as_tensor(features.index.to_numpy())
    dataset = TensorDataset(
        input_ids[rows],
        attention_mask[rows],
        torch.as_tensor(features.to_numpy(), dtype=torch.float32),
        torch.as_tensor(labels.to_numpy(), dtype=torch.long),
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


fusion_train_loader = _fusion_loader(X_train, y_train, train_batch_size, shuffle=True)
fusion_val_loader = _fusion_loader(X_val, y_val, val_batch_size, shuffle=False)

for epoch in range(epochs):
    multi_modal_model.train()
    total_loss = 0

    for batch in fusion_train_loader:
        batch_ids, batch_mask, batch_structured, batch_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        outputs = multi_modal_model(batch_ids, batch_mask, batch_structured)
        loss = criterion(outputs, batch_labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(fusion_train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Avg. Loss: {avg_loss:.4f}")

    # Evaluate on validation set
    multi_modal_model.eval()
    val_predictions = []
    val_labels = []

    with torch.no_grad():
        for batch in fusion_val_loader:
            batch_ids, batch_mask, batch_structured, batch_labels = (t.to(device) for t in batch)

            outputs = multi_modal_model(batch_ids, batch_mask, batch_structured)
            preds = torch.argmax(outputs, dim=1)

            val_predictions.extend(preds.cpu().numpy())
            val_labels.extend(batch_labels.cpu().numpy())

    val_accuracy = accuracy_score(val_labels, val_predictions)
    print(f"Validation Accuracy: {val_accuracy:.4f}")

    # Update learning rate scheduler
    scheduler.step(val_accuracy)

    # Early stopping: count consecutive epochs without a new best accuracy.
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1

    if early_stopping_counter >= early_stopping_patience:
        print("Early stopping triggered.")
        break

V. Deep Learning Architecture

In [None]:
# Load the bare BERT encoder and its matching lower-casing tokenizer.
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Turn every template into padded/truncated token ids plus attention mask.
max_tokens = 128  # Adjust as needed
encoded_text = tokenizer.batch_encode_plus(
    text_data,
    max_length=max_tokens,
    padding='max_length',
    truncation=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids, attention_mask = encoded_text['input_ids'], encoded_text['attention_mask']

In [None]:
# Convert labels to tensors.
# Go through NumPy: the split labels carry a shuffled index, and handing the
# Series itself to torch.tensor makes it look values up by index label.
# NOTE(review): this requires numeric labels — confirm 'Label' is 0/1 encoded.
y_train_text = torch.tensor(y_train.to_numpy())
y_val_text = torch.tensor(y_val.to_numpy())

# Create TensorDatasets for text data.
# The encoded tensors cover every row of `text_data`; pick the rows of each
# partition so that inputs and labels line up.
# NOTE(review): assumes the index labels of y_train / y_val equal row
# positions in `text_data` (true for a default RangeIndex) — confirm.
train_rows = torch.as_tensor(y_train.index.to_numpy())
val_rows = torch.as_tensor(y_val.index.to_numpy())

train_dataset = TensorDataset(input_ids[train_rows], attention_mask[train_rows], y_train_text)
val_dataset = TensorDataset(input_ids[val_rows], attention_mask[val_rows], y_val_text)

In [None]:
# Batch sizes; adjust as needed.
train_batch_size = 16
val_batch_size = 16

# Shuffle only the training batches.
train_loader_options = {'batch_size': train_batch_size, 'shuffle': True}
val_loader_options = {'batch_size': val_batch_size, 'shuffle': False}

train_loader = DataLoader(train_dataset, **train_loader_options)
val_loader = DataLoader(val_dataset, **val_loader_options)

In [None]:
# Convert structured data to TabularPandas format.
# `combined_features` has duplicate integer column labels and no target
# column, so a frame with unique string names plus 'Label' is built first.
tabular_df = pd.DataFrame(combined_features.to_numpy())
tabular_df.columns = [f'feature_{i}' for i in range(tabular_df.shape[1])]

cont_names = list(tabular_df.columns)  # Assuming structured features are numerical
cat_names = []  # No categorical features in this scenario
procs = [Categorify, Normalize]

# Rows of `combined_features` and `merged_data` are in the same order.
tabular_df['Label'] = merged_data['Label'].to_numpy()

# Split data (seeded so the partition is reproducible) and hand the split to
# TabularPandas directly.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(tabular_df))

# Create TabularPandas objects
tabular_data = TabularPandas(
    tabular_df,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    y_names='Label',
    y_block=CategoryBlock(),  # classification target
    splits=splits,
)

# Create DataLoaders; `to` is kept as the name used by the following cells.
to = tabular_data
dls = to.dataloaders(bs=64)


In [None]:
# Define Tabular CNN architecture
class TabularCNN(nn.Module):
    """Feed-forward tabular model over embedded categorical and continuous inputs.

    Args:
        emb_szs: Iterable of (cardinality, embedding_width) pairs, one per
            categorical column. May be empty.
        n_cont: Number of continuous input columns.
        out_sz: Number of output logits.
        layers: Hidden layer widths. May be empty, giving a single linear map.
        ps: Dropout probability for the embeddings and each hidden layer.
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, ps=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(ps)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        layer_list = []
        n_emb = sum(nf for ni, nf in emb_szs)
        n_in = n_emb + n_cont

        for n_out in layers:
            layer_list.append(nn.Linear(n_in, n_out))
            layer_list.append(nn.ReLU(inplace=True))
            layer_list.append(nn.BatchNorm1d(n_out))
            layer_list.append(nn.Dropout(ps))
            n_in = n_out

        # `n_in` is the last hidden width, or the raw input width when
        # `layers` is empty.
        layer_list.append(nn.Linear(n_in, out_sz))

        self.layers = nn.Sequential(*layer_list)

    def forward(self, x_cat, x_cont):
        """Return logits of shape (batch, out_sz)."""
        parts = []
        # With no categorical columns there is nothing to embed; torch.cat on
        # an empty list would raise.
        if len(self.embeds) > 0:
            embedded = [embed(x_cat[:, i]) for i, embed in enumerate(self.embeds)]
            parts.append(self.emb_drop(torch.cat(embedded, 1)))

        parts.append(self.bn_cont(x_cont))
        x = torch.cat(parts, 1)
        return self.layers(x)

In [None]:
# Initialize and set up the model, optimizer, and loss function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Embedding sizes are derived from the categorical columns of the tabular
# data object (an empty list when there are none).
emb_szs = get_emb_sz(to)
tabular_cnn_model = TabularCNN(emb_szs, len(cont_names), 2, [200, 100], ps=0.5)
tabular_cnn_model.to(device)

# Create FastAI Learner.
# CrossEntropyLossFlat flattens predictions and targets before the loss, which
# the tabular target batches need.
learn = Learner(dls, tabular_cnn_model, opt_func=Adam, loss_func=CrossEntropyLossFlat(), metrics=accuracy)

# Fine-tune the model
learn.fine_tune(5, base_lr=1e-3)

VI. Ensembling Models (Combining Multiple Models)

Using Voting Classifier : Combining the predictions of multiple models

In [None]:
# Named members of the ensemble.
# NOTE(review): `lstm_model` is not assigned in any cell visible above, and
# VotingClassifier expects scikit-learn style estimators — confirm that the
# LSTM and Tabular CNN objects are wrapped accordingly.
model_list = [
    ('xgboost', xgb_model),
    ('lstm', lstm_model),
    ('tabular_cnn', tabular_cnn_model),
]

# Soft-voting ensemble over the members.
voting_classifier = VotingClassifier(estimators=model_list, voting='soft')

# Search space; the `<name>__` prefix routes a parameter to that member.
params = dict(
    xgboost__n_estimators=[50, 100, 150],
    lstm__hidden_size=[64, 128, 256],
    tabular_cnn__layers=[[200, 100], [300, 150]],
    voting=['soft', 'hard'],
)


In [None]:
# 3-fold grid search over the ensemble's search space, scored by accuracy.
grid_search = GridSearchCV(voting_classifier, params, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best ensemble, refitted on the whole training partition.
best_ensemble_model = grid_search.best_estimator_

# Validation predictions and their accuracy.
ensemble_preds = best_ensemble_model.predict(X_val)
ensemble_accuracy = accuracy_score(y_val, ensemble_preds)
print(f"Ensemble Accuracy: {ensemble_accuracy:.4f}")

VII. Hyperparameter Tuning and Optimization

In [None]:
# (low, high) search interval per hyperparameter, grouped by model.
param_bounds = dict(
    # XGBoost (add more bounds here)
    xgboost_n_estimators=(50, 150),
    xgboost_max_depth=(3, 7),
    # LSTM (add more bounds here)
    lstm_hidden_size=(64, 256),
    lstm_num_layers=(1, 3),
    # Tabular CNN (add more bounds here)
    tabular_cnn_layers_0=(50, 200),
    tabular_cnn_layers_1=(25, 100),
    tabular_cnn_ps=(0.3, 0.7),
)

In [None]:
# Define the objective function to maximize (accuracy)
def objective_function(**params):
    """Score one hyperparameter sample by validation accuracy of the ensemble.

    Args:
        **params: Sampled hyperparameter values keyed as in `param_bounds`.
            Keys that are absent from the sample are skipped.

    Returns:
        Validation accuracy of the fitted ensemble. The value is returned
        as-is (not negated) because the optimizer maximizes its objective.
    """
    # Create the Voting Classifier with given hyperparameters
    ensemble_model = VotingClassifier(estimators=model_list, voting='soft')
    members = dict(model_list)

    # XGBoost is a scikit-learn style estimator: route through set_params.
    ensemble_model.set_params(
        xgboost__n_estimators=int(params['xgboost_n_estimators']),
        xgboost__max_depth=int(params['xgboost_max_depth']),
    )

    # NOTE(review): the LSTM and Tabular CNN members are addressed by plain
    # attribute assignment, as before. Assigning these attributes does not
    # rebuild an already-constructed network (and a framework may reject the
    # assignment) — confirm these members are wrapped so the values take effect.
    optional_settings = (
        ('lstm', 'hidden_size', 'lstm_hidden_size', int),
        ('lstm', 'num_layers', 'lstm_num_layers', int),
        ('tabular_cnn', 'ps', 'tabular_cnn_ps', float),
        ('tabular_cnn', 'learning_rate', 'tabular_cnn_learning_rate', float),
        ('tabular_cnn', 'dropout', 'tabular_cnn_dropout', float),
        ('tabular_cnn', 'batch_size', 'tabular_cnn_batch_size', int),
    )
    for member_name, attribute, key, cast in optional_settings:
        if key in params:
            setattr(members[member_name], attribute, cast(params[key]))

    if 'tabular_cnn_layers_0' in params and 'tabular_cnn_layers_1' in params:
        members['tabular_cnn'].layers = [
            int(params['tabular_cnn_layers_0']),
            int(params['tabular_cnn_layers_1']),
        ]

    # Fit the model on training data
    ensemble_model.fit(X_train, y_train)

    # Predict using the model
    ensemble_preds = ensemble_model.predict(X_val)

    # Higher is better; the optimizer maximizes this value.
    return accuracy_score(y_val, ensemble_preds)

In [None]:
# Seeded Bayesian optimizer over `param_bounds`.
bayes_optimizer = BayesianOptimization(
    f=objective_function,
    pbounds=param_bounds,
    random_state=42,
)

# 5 random warm-up evaluations followed by `max_iter` guided ones.
max_iter = 10  # Number of iterations
bayes_optimizer.maximize(n_iter=max_iter, init_points=5)

# Best sample found.
best_params = bayes_optimizer.max['params']
print("Best Hyperparameters:", best_params)

VIII. Neural Architecture Search (NAS)

In [None]:
# Reuse the train/validation partitions created in the data-splitting step.
# Re-splitting here would need names that are never defined (`X`, `y`) and
# would overwrite X_train / X_val for every later cell.

# Define the AutoKeras classifier using structured data block
clf_nas = ak.StructuredDataClassifier(max_trials=10, overwrite=True)

# Perform NAS for architecture search
clf_nas.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val))

# Get the best model architecture from NAS
best_nas_model = clf_nas.export_model()

In [None]:
# Define the ensemble models (XGBoost, LSTM, Tabular CNN)

# XGBoost
clf_xgboost = XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)

# LSTM
# X_train is two-dimensional (rows x features), so each row is presented as a
# sequence of `n_features` steps with one value per step.
# NOTE(review): Sequential, LSTM and Dense are not among the imports visible
# at the top of this notebook — confirm where they come from.
n_features = X_train.shape[1]
clf_lstm = Sequential()
clf_lstm.add(LSTM(units=128, activation='relu', input_shape=(n_features, 1)))
clf_lstm.add(Dense(units=1, activation='sigmoid'))
clf_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Tabular CNN, built with the constructor arguments the class defines:
# no categorical columns, every feature continuous, two output classes.
tabular_cnn_layers = [64, 32]  # Example architecture
clf_tabular_cnn = TabularCNN(emb_szs=[], n_cont=n_features, out_sz=2, layers=tabular_cnn_layers, ps=0.2)

# Create a list of models for Voting Classifier
# NOTE(review): VotingClassifier expects scikit-learn style estimators —
# confirm the neural members are wrapped accordingly.
model_list = [('xgboost', clf_xgboost), ('lstm', clf_lstm), ('tabular_cnn', clf_tabular_cnn), ('nas', best_nas_model)]

In [None]:
# Define the objective function for Bayesian Optimization
def objective_function(**params):
    """Score one hyperparameter sample by validation accuracy of the NAS model.

    This redefines the earlier `objective_function` of the same name. The
    sampled XGBoost, LSTM and Tabular CNN values are recorded on their model
    objects, but only the model rebuilt from the NAS architecture is trained
    and scored here.

    Args:
        **params: Sampled hyperparameter values keyed as in `param_bounds`.

    Returns:
        Validation accuracy. The value is returned as-is (not negated)
        because the optimizer maximizes its objective.
    """
    # Record the sampled hyperparameters on the other models.
    clf_xgboost.n_estimators = int(params['xgboost_n_estimators'])
    clf_xgboost.max_depth = int(params['xgboost_max_depth'])

    # NOTE(review): assigning `units` does not rebuild an already-built layer.
    clf_lstm.layers[0].units = int(params['lstm_hidden_size'])

    tabular_cnn_layers[0] = int(params['tabular_cnn_layers_0'])
    tabular_cnn_layers[1] = int(params['tabular_cnn_layers_1'])

    # Rebuild a fresh, untrained model from the best NAS architecture.
    # NOTE(review): `Sequential.from_config` presumes the exported model is a
    # plain Sequential stack — confirm against the AutoKeras export.
    best_nas_architecture = best_nas_model.get_config()
    clf_nas_optimized = Sequential.from_config(best_nas_architecture)

    # Compile the model
    clf_nas_optimized.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Fit the NAS-optimized model on training data
    clf_nas_optimized.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val))

    # Turn predicted probabilities into class indices: argmax for a
    # multi-column output, a 0.5 threshold for a single sigmoid column.
    raw_preds = np.asarray(clf_nas_optimized.predict(X_val))
    if raw_preds.ndim > 1 and raw_preds.shape[-1] > 1:
        nas_optimized_preds = raw_preds.argmax(axis=-1)
    else:
        nas_optimized_preds = (raw_preds.ravel() > 0.5).astype(int)

    # Higher is better; the optimizer maximizes this value.
    return accuracy_score(y_val, nas_optimized_preds)

In [None]:
# (low, high) search interval per hyperparameter.
param_bounds = dict(
    xgboost_n_estimators=(50, 300),
    xgboost_max_depth=(3, 15),
    lstm_hidden_size=(16, 256),
    lstm_num_layers=(1, 3),
    tabular_cnn_layers_0=(32, 128),
    tabular_cnn_layers_1=(16, 64),
    tabular_cnn_ps=(0.1, 0.5),
    tabular_cnn_learning_rate=(0.001, 0.01),
    tabular_cnn_dropout=(0.1, 0.5),
    tabular_cnn_batch_size=(16, 128),
)

# Seeded Bayesian optimizer over those bounds.
bayes_optimizer = BayesianOptimization(
    f=objective_function,
    pbounds=param_bounds,
    random_state=42,
)

# 5 random warm-up evaluations followed by `max_iter` guided ones.
max_iter = 10  # Number of iterations
bayes_optimizer.maximize(n_iter=max_iter, init_points=5)

# Best sample found.
best_params = bayes_optimizer.max['params']
print("Best Hyperparameters:", best_params)

EVALUATING PERFORMANCE

In [None]:
# Set hyperparameters for XGBoost
xgboost_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0,
    'min_child_weight': 1,
    'reg_alpha': 0,
    'reg_lambda': 1,
}

# Set hyperparameters for LSTM
# NOTE(review): defined for reference only; nothing in this cell applies them.
lstm_params = {
    'units': 64,
    'dropout': 0.2,
    'batch_size': 32,
    'epochs': 10,
    'learning_rate': 0.001,
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
}


# Set hyperparameters for Tabular CNN
# NOTE(review): defined for reference only; nothing in this cell applies them.
tabular_cnn_params = {
    'num_layers': 3,
    'hidden_units': [128, 64, 32],
    'dropout': 0.3,
    'batch_size': 64,
    'epochs': 20,
    'learning_rate': 0.001,
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
}

# Build the ensemble at notebook level (earlier cells only create it inside
# the objective functions) and apply the XGBoost settings above.
ensemble_model = VotingClassifier(estimators=model_list, voting='soft')
ensemble_model.set_params(**{f'xgboost__{name}': value for name, value in xgboost_params.items()})

# Fit the ensemble model on training data
ensemble_model.fit(X_train, y_train)

# Predicted probability of the second class for every validation row.
# NOTE(review): assumes binary labels whose positive class sorts last
# (e.g. 0/1) — confirm against ensemble_model.classes_.
ensemble_preds = ensemble_model.predict_proba(X_val)[:, 1]

# Convert probabilities to binary predictions (using a threshold of 0.5)
ensemble_preds_binary = (ensemble_preds > 0.5).astype(int)

# Calculate and print evaluation metrics
accuracy = accuracy_score(y_val, ensemble_preds_binary)
precision = precision_score(y_val, ensemble_preds_binary)
recall = recall_score(y_val, ensemble_preds_binary)
f1 = f1_score(y_val, ensemble_preds_binary)
conf_matrix = confusion_matrix(y_val, ensemble_preds_binary)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("Confusion Matrix:\n", conf_matrix)
