In [1]:
%pip install bloom_filter
%pip install tensorflow_hub

Collecting bloom_filter
  Downloading https://files.pythonhosted.org/packages/6f/85/c26819421801c5a04a2743e329641dde22225a55153d5477c032b4f7d40e/bloom_filter-1.3-py3-none-any.whl
Installing collected packages: bloom-filter
Successfully installed bloom-filter-1.3


#### Let us import all the modules required

In [21]:
from bloom_filter import BloomFilter
import tensorflow as tf 
import tensorflow_hub as hub
import numpy as np
import os
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report,accuracy_score
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, Activation, Flatten, Dense
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dropout, Add, Bidirectional, LSTM,Attention,  SeparableConv1D
from tensorflow.keras.models import Model
from copy import deepcopy
from tensorflow.keras.utils import to_categorical
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Concatenate

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

####Now let us clean the data and make the dataset such that we have equal number of good and malicious URLs

In [9]:
def gen_data():
    fs = open("test_input.txt", "r",encoding='utf-8')
    X = []
    y = []
    c1 = 0
    c2 = 0
    for i, line in enumerate(fs.readlines()[1:]):
        url = line[:-5]
        label = line[-5:-1]
        url.strip(',')
        url.strip("")
        if ((c1==30000) and (c2==30000)):
          break
        if label==",bad":
          if c1<30000:
            c1+=1
            X.append(url)
            y.append(0)
        else:
          if c2<30000:
            c2+=1
            X.append(url)
            y.append(1)
    data = []
    for i in range(len(X)):
        data.append([X[i], y[i]])
    return data

####Now let us split the data into features and labels for training and testing

In [10]:
data = gen_data()
train_features = np.array([str(i[0]) for i in data])
train_labels = np.array([i[1] for i in data])

####Here we build the model and train the model on our dataset

In [41]:
# ========================= Tokenization & Preprocessing =========================
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK') 
input_size = 1014

# Alphabet & char dict
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = {char: i+1 for i, char in enumerate(alphabet)}
char_dict['UNK'] = len(char_dict) + 1
tk.word_index = char_dict

# Preprocess texts
train_texts = np.array(data)[:,0]
y_train = np.array(data)[:,1].astype(np.float32)  # binary labels

train_texts = [s.lower() for s in train_texts]
train_texts = tk.texts_to_sequences(train_texts)
train_data = pad_sequences(train_texts, maxlen=input_size, padding='post')

# ========================= Model Definition =========================
def residual_block(x, conv_block, name=None):
    shortcut = x
    out = conv_block(x)
    if shortcut.shape[-1] != out.shape[-1]:
        shortcut = Conv1D(out.shape[-1], 1, padding='same')(shortcut)
    return Add(name=name)([shortcut, out])

def hybrid_model(input_size, vocab_size,
                 emb_dim=128,
                 cnn_filters=64,
                 cnn_kernel=7,
                 cnn_pool=3,
                 bi_lstm_units=128,
                 fc_units=128):
    inp = Input(shape=(input_size,), name="input_ids")
    emb = Embedding(input_dim=vocab_size + 1,  # +1 for OOV token
                    output_dim=emb_dim,
                    name="shared_embedding")(inp)

    # Branch A: Char-CNN
    x = Conv1D(filters=cnn_filters, kernel_size=cnn_kernel, activation="relu", padding="same")(emb)
    x = MaxPooling1D(pool_size=cnn_pool)(x)
    x = Flatten()(x)
    branch_a = Dense(fc_units, activation="relu", name="branchA_fc")(x)

    # Branch B: CNN-BiLSTM+Attention with safe residual
    y = Conv1D(256, 5, activation="relu", padding="same")(emb)
    y = MaxPooling1D(2)(y)
    y = Conv1D(256, 3, activation="relu", padding="same")(y)
    y = residual_block(MaxPooling1D(2)(emb), lambda x: Conv1D(256, 3, activation="relu", padding="same")(x), name="branchB_residual")
    y = Bidirectional(LSTM(bi_lstm_units, return_sequences=True), name="branchB_bilstm")(y)
    y = Attention(name="branchB_attention")([y, y])
    y = Flatten()(y)
    branch_b = Dense(fc_units, activation="relu", name="branchB_fc")(y)

    # Merge & classify
    merged = Concatenate(name="fusion")([branch_a, branch_b])
    merged = Dropout(0.2)(merged)
    merged = Dense(fc_units, activation="relu", name="fusion_fc")(merged)
    out = Dense(1, activation="sigmoid", name="output")(merged)

    return Model(inputs=inp, outputs=out, name="Hybrid_CharCNN_CNNBiLSTM")

def fast_hybrid(input_size, vocab_size,
                      emb_dim=32, cnn_filters=64, cnn_kernel=5, cnn_pool=2,
                      bi_lstm_units=32, fc_units=16):
    inp = Input(shape=(input_size,), name="input_ids")
    emb = Embedding(vocab_size+1, emb_dim)(inp)

    # Branch A: single separable conv + pooling
    a = SeparableConv1D(cnn_filters, cnn_kernel, activation="relu", padding="same")(emb)
    a = MaxPooling1D(cnn_pool)(a)
    a = GlobalMaxPooling1D()(a)
    a = Dense(fc_units, activation="relu")(a)

    # Branch B: tiny BiLSTM
    b = Bidirectional(LSTM(bi_lstm_units))(emb)
    b = Dense(fc_units, activation="relu")(b)

    # Fuse
    m = Concatenate()([a, b])
    m = Dropout(0.2)(m)
    out = Dense(1, activation="sigmoid")(m)

    return Model(inp, out, name="UltraFast_Hybrid")

model = fast_hybrid(input_size, vocab_size)
model.compile('adam', 'binary_crossentropy', ['accuracy'])
model.summary()
model.fit(train_data[:, :input_size], y_train,
          batch_size=batch_size, epochs=epochs, verbose=2)



# ========================= Compile & Train =========================
vocab_size = len(char_dict)
model = fast_hybrid_model(input_size, vocab_size)

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

model.fit(train_data, y_train,
          batch_size=256,
          epochs=1,
          verbose=2)


Epoch 1/5


ValueError: dtype='string' is not a valid dtype for Keras type promotion.

In [26]:
def fast_charcnn(input_size, vocab_size, emb_dim=128):
    inp = Input(shape=(input_size,), name="input_ids")
    emb = Embedding(input_dim=vocab_size,
                    output_dim=emb_dim,
                    input_length=input_size,
                    name="embedding")(inp)

    x = SeparableConv1D(128, 7, activation="relu", padding="same")(emb)
    x = MaxPooling1D(3)(x)
    x = SeparableConv1D(128, 5, activation="relu", padding="same")(x)
    x = MaxPooling1D(3)(x)
    x = SeparableConv1D(128, 3, activation="relu", padding="same")(x)
    x = GlobalMaxPooling1D()(x)

    x = Dense(64, activation="relu")(x)
    x = Dropout(0.2)(x)
    out = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=out, name="Fast_CharCNN_Only")
    return model

model = fast_charcnn(input_size, vocab_size)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
model.fit(train_data, train_classes,
          batch_size=64,
          epochs=1,
          verbose=2)

vocab_size = len(tk.word_index) + 1



938/938 - 62s - 66ms/step - accuracy: 0.8855 - loss: 0.1688


####This is a function to test our model

In [33]:
def test_model(test_texts):
    test_texts = [s.lower() for s in test_texts]
    test_texts = tk.texts_to_sequences(test_texts)
    test_data = pad_sequences(test_texts, maxlen=input_size, padding='post')
    test_data = np.array(test_data, dtype='float32')
    y = model.predict(test_data, verbose=0)
    return y.flatten()  # Returns array of shape (N,)


####This is a function to add elements to the backup bloom filter after passing the data through classifier and it returns the backup bloom filter.


In [34]:
def Train_Bloom2(bloom, train_features, train_labels, tau):
    preds = test_model(train_features)
    for i in range(len(preds)):
        if preds[i] < tau:
            if train_labels[i] == 1:
                bloom.add(str(train_features[i]))
    return bloom


####This is a function to test the Sandwich Learned Bloom Filter

In [35]:
def Test_SLBF(bloom1, bloom2, data, tau, prediction):
    output1 = []
    for i in range(len(data)):
        if str(data[i]) not in bloom1:
            output1.append(0)
            continue
        if prediction[i] > tau:
            output1.append(1)
        elif str(data[i]) in bloom2:
            output1.append(1)
        else:
            output1.append(0)
    return np.array(output1)


####This is a function to test the Normal Learned Bloom Filter

In [36]:
def Test_NLBF(bloom2, data, tau, prediction):
    output1 = []
    for i in range(len(data)):
        if prediction[i] > tau:
            output1.append(1)
        elif str(data[i]) in bloom2:
            output1.append(1)
        else:
            output1.append(0)
    return np.array(output1)


####This is a function to test the regular Bloom Filter

In [37]:
def Test_BF(bloom1, test_data):
    y_pred_bloom = []
    for i in test_data:
        if str(i) in bloom1:
            y_pred_bloom.append(1)
        else:
            y_pred_bloom.append(0)
    return np.array(y_pred_bloom)


####Now we test all the implemented filters and gather their accuracies for comparision

In [38]:
error_rates = [0.01 * i for i in range(1, 11)]
tau = 0.9
accuracies = []

for er in tqdm(error_rates):
    classifier_data = []
    bloom1 = BloomFilter(max_elements=25000, error_rate=er)
    bloom2 = BloomFilter(max_elements=25000, error_rate=er)

    # Add all positives to bloom1
    for data_point in data:
        if data_point[1] == 1:
            bloom1.add(data_point[0])

    # Create classifier data (present in bloom1)
    for data_point in data:
        if data_point[0] in bloom1:
            classifier_data.append(data_point)

    # Train bloom2 using current classifier
    bloom2 = Train_Bloom2(bloom2, train_features, train_labels, tau)

    # Full dataset test
    test_data = np.array([i[0] for i in data])
    y_true = np.array([i[1] for i in data])
    prediction = test_model(test_data)

    y_pred_sandwich = Test_SLBF(bloom1, bloom2, test_data, tau, prediction)
    y_pred_normal = Test_NLBF(bloom2, test_data, tau, prediction)
    y_pred_bloom = Test_BF(bloom1, test_data)

    accuracies.append([
        accuracy_score(y_true, y_pred_sandwich),
        accuracy_score(y_true, y_pred_normal),
        accuracy_score(y_true, y_pred_bloom)
    ])


  0%|          | 0/10 [00:00<?, ?it/s]

####Now let us plot the accuracies of all the filters with variying error rates

In [40]:
import plotly.graph_objects as go

normal = [100 * acc[1] for acc in accuracies]
bloom = [100 * acc[2] for acc in accuracies]

t = error_rates  

# Interactive plot
fig = go.Figure()


# Neural LBF
fig.add_trace(go.Scatter(
    x=t, y=normal,
    mode='lines+markers',
    name='Neural LBF',
    line=dict(color='green'),
    marker=dict(size=8)
))

# Plain Bloom Filter
fig.add_trace(go.Scatter(
    x=t, y=bloom,
    mode='lines+markers',
    name='Bloom Filter',
    line=dict(color='red'),
    marker=dict(size=8)
))

# Layout
fig.update_layout(
    title="Comparison of Learned Bloom Filters vs Plain Bloom Filter",
    xaxis_title="Bloom Filter Error Rate",
    yaxis_title="Accuracy (%)",
    legend=dict(
        x=0.01, y=0.99,
        bgcolor='rgba(255,255,255,0)',
        bordercolor='rgba(0,0,0,0)'
    ),
    template='plotly_white',
    height=720,
    width=1280,
    font=dict(size=16)
)

fig.show()
