In [1]:
import pandas as pd
import numpy as np

In [2]:
df1=pd.read_parquet("final.parquet")
clients=pd.read_csv("client_hostname.csv")

In [3]:
top_requests = df1['request'].value_counts().head(52).index

In [4]:
df2 = df1[df1['request'].isin(top_requests)].copy()

In [5]:
import string
charset = list(string.ascii_lowercase + string.ascii_uppercase)
unique_requests = df2['request'].unique()
encoding_map = {req: charset[i] for i, req in enumerate(unique_requests)}
df2['encoded_request'] = df2['request'].map(encoding_map)

In [7]:
df3 = df2.groupby('client')['encoded_request'].apply(lambda x: ''.join(x)).reset_index()
df3.columns = ['client', 'request_sequence']

In [8]:
min_length = 50
df = df3[df3['request_sequence'].str.len() >= min_length].copy()

In [9]:
import networkx as nx
from node2vec import Node2Vec

G = nx.DiGraph()
for seq in df['request_sequence']:
    for i in range(len(seq) - 1):
        G.add_edge(seq[i], seq[i + 1])

node2vec = Node2Vec(G, dimensions=128, walk_length=10, num_walks=50, workers=4)
model = node2vec.fit(window=5, min_count=1)

def get_sequence_embedding(sequence, model):
    vectors = [model.wv[char] for char in sequence if char in model.wv]
    return sum(vectors) / len(vectors) if vectors else [0] * model.vector_size

df['vector_representation_node2vec'] = df['request_sequence'].apply(lambda x: get_sequence_embedding(x, model))


Computing transition probabilities:   0%|          | 0/52 [00:00<?, ?it/s]

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

all_chars = sorted(set(''.join(df['request_sequence'])))
char_to_index = {char: i for i, char in enumerate(all_chars)}

def sequence_to_vector(sequence, char_to_index):
    vector = np.zeros(len(char_to_index))
    for char in sequence:
        if char in char_to_index:
            vector[char_to_index[char]] += 1
    return vector

df['vectorized'] = df['request_sequence'].apply(lambda x: sequence_to_vector(x, char_to_index))

input_dim = len(char_to_index)
encoding_dim = 128

input_seq = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_seq)
decoded = Dense(input_dim, activation='sigmoid')(encoded)

autoencoder = Model(input_seq, decoded)
encoder = Model(input_seq, encoded)

autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

X = np.array(list(df['vectorized']))
autoencoder.fit(X, X, epochs=50, batch_size=2, shuffle=True)

df['vector_representation_autoencoder'] = list(encoder.predict(X))


Epoch 1/50
[1m22571/22571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 3ms/step - loss: -1466647.6250
Epoch 2/50
[1m22571/22571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 3ms/step - loss: -10233596.0000
Epoch 3/50
[1m22571/22571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 3ms/step - loss: -24927110.0000
Epoch 4/50
[1m22571/22571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 3ms/step - loss: -28069730.0000
Epoch 5/50
[1m22571/22571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 3ms/step - loss: -33339958.0000
Epoch 6/50
[1m22571/22571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 3ms/step - loss: -54267820.0000
Epoch 7/50
[1m22571/22571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 3ms/step - loss: -80674680.0000
Epoch 8/50
[1m22571/22571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 3ms/step - loss: -100171512.0000
Epoch 9/50
[1m22571/22571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 3ms/st

In [11]:
import pywt
from sklearn.preprocessing import StandardScaler

def adaptive_wavelet_transform(sequence, wavelets=['db4', 'coif2'], max_level=5, output_length=128):
    best_wavelet = None
    best_coeffs = None
    max_energy = -np.inf

    for wavelet in wavelets:
        level = min(max_level, int(np.floor(np.log2(len(sequence)))))
        coeffs = pywt.wavedec(sequence, wavelet, level=level)
        flat_coeffs = np.hstack(coeffs)
        energy = np.sum(flat_coeffs ** 2)

        if energy > max_energy:
            max_energy = energy
            best_wavelet = wavelet
            best_coeffs = flat_coeffs

    best_coeffs = best_coeffs[:output_length] if len(best_coeffs) > output_length else np.pad(
        best_coeffs, (0, output_length - len(best_coeffs))
    )
    return best_coeffs

df['numeric_sequence'] = df['request_sequence'].apply(lambda seq: [ord(char) for char in seq])

output_length = 128
df['vector_representation_adaptive_wavelet'] = df['numeric_sequence'].apply(
    lambda seq: adaptive_wavelet_transform(seq, wavelets=['db4', 'coif2', 'sym5'], max_level=5, output_length=output_length)
)

combined_vectors = np.vstack(df['vector_representation_adaptive_wavelet'].values)
scaler = StandardScaler()
X = scaler.fit_transform(combined_vectors)





In [12]:
from gensim.models import Word2Vec
sequences = [list(seq) for seq in df['request_sequence']]
word2vec_model = Word2Vec(sequences, vector_size=128, window=2, min_count=1, sg=1, workers=4)
def sequence_to_vector(sequence):
    vectors = [word2vec_model.wv[char] for char in sequence if char in word2vec_model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)

df['vector_representation_word2vec'] = df['request_sequence'].apply(lambda x: sequence_to_vector(list(x)))

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

tfidf_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 10))
tfidf_matrix = tfidf_vectorizer.fit_transform(df['request_sequence'])
svd = TruncatedSVD(n_components=128)
reduced_tfidf = svd.fit_transform(tfidf_matrix)

df['vector_representation_tfidf'] = list(reduced_tfidf)


In [14]:
from collections import defaultdict

def build_transition_matrix(sequences, n):
    transition_counts = defaultdict(lambda: defaultdict(int))
    for seq in sequences:
        for i in range(len(seq) - n + 1):
            ngram = seq[i:i + n - 1]
            next_char = seq[i + n - 1]
            transition_counts[ngram][next_char] += 1

    transition_matrix = {}
    for ngram, next_chars in transition_counts.items():
        total = sum(next_chars.values())
        transition_matrix[ngram] = {char: count / total for char, count in next_chars.items()}
    
    return transition_matrix

def get_all_next_chars(transition_matrices):
    all_next_chars = set()
    for tm in transition_matrices:
        for next_chars in tm.values():
            all_next_chars.update(next_chars.keys())
    return list(all_next_chars)


def sequence_to_embedding(sequence, transition_matrices, all_next_chars, n_range):
    char_to_index = {char: i for i, char in enumerate(all_next_chars)}
    embedding_dim = len(all_next_chars)
    embedding = np.zeros(embedding_dim)
    
    for n, tm in zip(n_range, transition_matrices):
        for i in range(len(sequence) - n + 1):
            ngram = sequence[i:i + n - 1]
            if ngram in tm:
                for char, prob in tm[ngram].items():
                    embedding[char_to_index[char]] += prob
    return embedding / len(sequence) if len(sequence) >= min(n_range) else embedding


n_range = range(2, 10)
transition_matrices = [build_transition_matrix(df['request_sequence'], n=n) for n in n_range]

all_next_chars = get_all_next_chars(transition_matrices)

try:
    df['vector_representation_markov'] = df['request_sequence'].apply(
        lambda x: sequence_to_embedding(x, transition_matrices, all_next_chars, n_range)
    )
except Exception as e:
    print("Error:", e)


def sinusoidal_expand_embedding(embedding, target_dim=128, noise=1e-6):
    expanded_embedding = np.zeros(target_dim)
    expanded_embedding[:len(embedding)] = embedding
    for i in range(len(embedding), target_dim):
        expanded_embedding[i] = (np.sin(i) if i % 2 == 0 else np.cos(i)) + np.random.uniform(-noise, noise)
    return expanded_embedding

df['vector_representation_markov_sinusoidal_expanded'] = df['vector_representation_markov'].apply(
    lambda x: sinusoidal_expand_embedding(x, target_dim=128)
)


In [27]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

vector_columns = ['vector_representation_node2vec','vector_representation_adaptive_wavelet','vector_representation_autoencoder','vector_representation_word2vec','vector_representation_tfidf','vector_representation_markov_sinusoidal_expanded']

#for col in vector_columns:
#    df[col] = df[col].apply(lambda x: np.array(x).flatten() if isinstance(x, (list, np.ndarray)) else x)

combined_vectors = np.hstack([np.vstack(df[col]) for col in vector_columns])
scaler = StandardScaler()
X = scaler.fit_transform(combined_vectors)

input_dim = X.shape[1]

input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(64, activation='relu')(encoded)
bottleneck = Dense(64, activation='relu')(encoded)

decoded = Dense(64, activation='relu')(bottleneck)
decoded = Dense(128, activation='relu')(decoded)
output_layer = Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer='adam', loss='mse')

autoencoder.fit(X, X, epochs=50, batch_size=128, shuffle=True)

encoder = Model(inputs=input_layer, outputs=bottleneck)
reduced_vectors = encoder.predict(X)

kmeans = KMeans(n_clusters=2, random_state=42)
df['cluster'] = kmeans.fit_predict(reduced_vectors)

print(df.groupby('cluster').size())


Epoch 1/50
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.9053
Epoch 2/50
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.9380
Epoch 3/50
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 1.0042
Epoch 4/50
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.8461
Epoch 5/50
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.8822
Epoch 6/50
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.9919
Epoch 7/50
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.7937
Epoch 8/50
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.9835
Epoch 9/50
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.7886
Epoch 10/50
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - lo

## Number of IPs that are well known to be bots in two separate clusters

In [30]:
import re
bots=df1[df1['user_agent'].str.contains('bot|semrush|ahrefs|petal|bing|pingdom|facebook|headless',flags=re.IGNORECASE,regex=True)]['client'].unique()
print(df[df['cluster'] == 0]['client'].isin(bots).value_counts(),df[df['cluster'] == 1]['client'].isin(bots).value_counts())

client
False    42700
True       133
Name: count, dtype: int64 client
False    1732
True      576
Name: count, dtype: int64


## In the small cluster, what are the hostnames of the IPs other than the well-known bot IPs? Are Data Center, Cloud, TOR hostnames very prominent?

In [35]:
print(clients[clients['client'].isin(df[(df['cluster']==1) & (~df['client'].isin(bots))]['client'])].to_markdown())

|        | client          | hostname                                      | alias_list                                                     | address_list        |
|-------:|:----------------|:----------------------------------------------|:---------------------------------------------------------------|:--------------------|
|     55 | 89.199.157.16   | 89.199.157.16                                 | [Errno 1] Unknown host                                         | nan                 |
|     68 | 185.101.216.150 | 185.101.216.150                               | [Errno 1] Unknown host                                         | nan                 |
|    269 | 85.239.213.30   | 85.239.213.30                                 | [Errno 1] Unknown host                                         | nan                 |
|    328 | 94.24.17.148    | 94.24.17.148                                  | [Errno 1] Unknown host                                         | nan                 |
|    458 | 5.236